#Importing the data and files
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import seaborn as sns
from scipy import stats; from scipy.stats import zscore, norm, randint
import warnings
warnings.filterwarnings("ignore")
# Load the IHM Stefanini industrial safety & health accident records.
data= pd.read_csv('IHMStefanini_industrial_safety_and_health_database_with_accidents_description.csv')
print("Shape of the dataset is :",data.shape)
data.head()
Shape of the dataset is : (425, 11)
| Unnamed: 0 | Data | Countries | Local | Industry Sector | Accident Level | Potential Accident Level | Genre | Employee or Third Party | Critical Risk | Description | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2016-01-01 00:00:00 | Country_01 | Local_01 | Mining | I | IV | Male | Third Party | Pressed | While removing the drill rod of the Jumbo 08 f... |
| 1 | 1 | 2016-01-02 00:00:00 | Country_02 | Local_02 | Mining | I | IV | Male | Employee | Pressurized Systems | During the activation of a sodium sulphide pum... |
| 2 | 2 | 2016-01-06 00:00:00 | Country_01 | Local_03 | Mining | I | III | Male | Third Party (Remote) | Manual Tools | In the sub-station MILPO located at level +170... |
| 3 | 3 | 2016-01-08 00:00:00 | Country_01 | Local_04 | Mining | I | I | Male | Third Party | Others | Being 9:45 am. approximately in the Nv. 1880 C... |
| 4 | 4 | 2016-01-10 00:00:00 | Country_01 | Local_04 | Mining | IV | IV | Male | Third Party | Others | Approximately at 11:45 a.m. in circumstances t... |
# Drop the redundant row-index column carried over from the CSV export,
# then confirm there are no missing values in any column.
data.drop(columns=["Unnamed: 0"], inplace=True)
data.isnull().sum()
Data 0 Countries 0 Local 0 Industry Sector 0 Accident Level 0 Potential Accident Level 0 Genre 0 Employee or Third Party 0 Critical Risk 0 Description 0 dtype: int64
# Report and remove exact duplicate rows.
shape_before = data.shape
dup_count = data.duplicated().sum()
print("Shape of the dataset before duplicates deletion is :", shape_before)
print('Number of duplicates in the dataset :', dup_count)
data.drop_duplicates(inplace=True)
print("Shape of the dataset after duplicates deletion is :", data.shape)
Shape of the dataset before duplicates deletion is : (425, 10) Number of duplicates in the dataset : 7 Shape of the dataset after duplicates deletion is : (418, 10)
# Inspect the column dtypes and the overall frame summary.
divider = '----------------------------------------------'
print('********Checking the dtypes*********\n')
print(data.dtypes)
print(divider)
print('\n *********Checking the data info********* \n')
print(data.info())
********Checking the dtypes********* Data object Countries object Local object Industry Sector object Accident Level object Potential Accident Level object Genre object Employee or Third Party object Critical Risk object Description object dtype: object ---------------------------------------------- *********Checking the data info********* <class 'pandas.core.frame.DataFrame'> Int64Index: 418 entries, 0 to 424 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Data 418 non-null object 1 Countries 418 non-null object 2 Local 418 non-null object 3 Industry Sector 418 non-null object 4 Accident Level 418 non-null object 5 Potential Accident Level 418 non-null object 6 Genre 418 non-null object 7 Employee or Third Party 418 non-null object 8 Critical Risk 418 non-null object 9 Description 418 non-null object dtypes: object(10) memory usage: 35.9+ KB None
From the above, it is clearly evident that all the columns of the data frame are of the type object.
# Summary statistics; every column is object dtype, so describe()
# reports count / unique / top / freq per column.
data.describe()
| Data | Countries | Local | Industry Sector | Accident Level | Potential Accident Level | Genre | Employee or Third Party | Critical Risk | Description | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 418 | 418 | 418 | 418 | 418 | 418 | 418 | 418 | 418 | 418 |
| unique | 287 | 3 | 12 | 3 | 5 | 6 | 2 | 3 | 33 | 411 |
| top | 2017-02-08 00:00:00 | Country_01 | Local_03 | Mining | I | IV | Male | Third Party | Others | Project of Vazante that carried out sediment c... |
| freq | 6 | 248 | 89 | 237 | 309 | 141 | 396 | 185 | 229 | 2 |
# Standardise the raw headers: Data -> Date, Countries -> Country,
# Genre -> Gender, Employee or Third Party -> Natureofemployee.
column_renames = {
    'Data': 'Date',
    'Countries': 'Country',
    'Genre': 'Gender',
    'Employee or Third Party': 'Natureofemployee',
}
data.rename(columns=column_renames, inplace=True)
data.head(3)
| Date | Country | Local | Industry Sector | Accident Level | Potential Accident Level | Gender | Natureofemployee | Critical Risk | Description | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-01-01 00:00:00 | Country_01 | Local_01 | Mining | I | IV | Male | Third Party | Pressed | While removing the drill rod of the Jumbo 08 f... |
| 1 | 2016-01-02 00:00:00 | Country_02 | Local_02 | Mining | I | IV | Male | Employee | Pressurized Systems | During the activation of a sodium sulphide pum... |
| 2 | 2016-01-06 00:00:00 | Country_01 | Local_03 | Mining | I | III | Male | Third Party (Remote) | Manual Tools | In the sub-station MILPO located at level +170... |
# List the distinct values of every column except the free-text
# Description and the Date column.  The original built this list via
# data[data.columns[~data.columns.isin(...)]].columns.tolist(), which
# materialises a full DataFrame slice just to read its column names;
# a comprehension over data.columns is equivalent and direct.
col = [c for c in data.columns if c not in ('Date', 'Description')]
for cols in col:
    print(f'Unique values for {cols} is \n{data[cols].unique()}\n')
Unique values for Country is ['Country_01' 'Country_02' 'Country_03'] Unique values for Local is ['Local_01' 'Local_02' 'Local_03' 'Local_04' 'Local_05' 'Local_06' 'Local_07' 'Local_08' 'Local_10' 'Local_09' 'Local_11' 'Local_12'] Unique values for Industry Sector is ['Mining' 'Metals' 'Others'] Unique values for Accident Level is ['I' 'IV' 'III' 'II' 'V'] Unique values for Potential Accident Level is ['IV' 'III' 'I' 'II' 'V' 'VI'] Unique values for Gender is ['Male' 'Female'] Unique values for Natureofemployee is ['Third Party' 'Employee' 'Third Party (Remote)'] Unique values for Critical Risk is ['Pressed' 'Pressurized Systems' 'Manual Tools' 'Others' 'Fall prevention (same level)' 'Chemical substances' 'Liquid Metal' 'Electrical installation' 'Confined space' 'Pressurized Systems / Chemical Substances' 'Blocking and isolation of energies' 'Suspended Loads' 'Poll' 'Cut' 'Fall' 'Bees' 'Fall prevention' '\nNot applicable' 'Traffic' 'Projection' 'Venomous Animals' 'Plates' 'Projection/Burning' 'remains of choco' 'Vehicles and Mobile Equipment' 'Projection/Choco' 'Machine Protection' 'Power lock' 'Burn' 'Projection/Manual Tools' 'Individual protection equipment' 'Electrical Shock' 'Projection of fragments']
# Re-encode the Local_XX labels as their integer suffix (Local_01 -> 1,
# ..., Local_12 -> 12).
replace_val = {f'Local_{i:02d}': i for i in range(1, 13)}
data['Local'] = data['Local'].map(replace_val)
del replace_val
data.head(5)
| Date | Country | Local | Industry Sector | Accident Level | Potential Accident Level | Gender | Natureofemployee | Critical Risk | Description | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-01-01 00:00:00 | Country_01 | 1 | Mining | I | IV | Male | Third Party | Pressed | While removing the drill rod of the Jumbo 08 f... |
| 1 | 2016-01-02 00:00:00 | Country_02 | 2 | Mining | I | IV | Male | Employee | Pressurized Systems | During the activation of a sodium sulphide pum... |
| 2 | 2016-01-06 00:00:00 | Country_01 | 3 | Mining | I | III | Male | Third Party (Remote) | Manual Tools | In the sub-station MILPO located at level +170... |
| 3 | 2016-01-08 00:00:00 | Country_01 | 4 | Mining | I | I | Male | Third Party | Others | Being 9:45 am. approximately in the Nv. 1880 C... |
| 4 | 2016-01-10 00:00:00 | Country_01 | 4 | Mining | IV | IV | Male | Third Party | Others | Approximately at 11:45 a.m. in circumstances t... |
# Parse the date strings once, then derive calendar features with the
# vectorised .dt accessor instead of per-row apply(lambda ...) calls —
# same values, no Python-level loop.
data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Weekday'] = data['Date'].dt.day_name()
data.head()
| Date | Country | Local | Industry Sector | Accident Level | Potential Accident Level | Gender | Natureofemployee | Critical Risk | Description | Year | Month | Weekday | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-01-01 | Country_01 | 1 | Mining | I | IV | Male | Third Party | Pressed | While removing the drill rod of the Jumbo 08 f... | 2016 | 1 | Friday |
| 1 | 2016-01-02 | Country_02 | 2 | Mining | I | IV | Male | Employee | Pressurized Systems | During the activation of a sodium sulphide pum... | 2016 | 1 | Saturday |
| 2 | 2016-01-06 | Country_01 | 3 | Mining | I | III | Male | Third Party (Remote) | Manual Tools | In the sub-station MILPO located at level +170... | 2016 | 1 | Wednesday |
| 3 | 2016-01-08 | Country_01 | 4 | Mining | I | I | Male | Third Party | Others | Being 9:45 am. approximately in the Nv. 1880 C... | 2016 | 1 | Friday |
| 4 | 2016-01-10 | Country_01 | 4 | Mining | IV | IV | Male | Third Party | Others | Approximately at 11:45 a.m. in circumstances t... | 2016 | 1 | Sunday |
# Map a calendar month number to a season label.
def convert_to_season(x):
    """Return the season name for month number *x* (1-12).

    Months 9-11 -> 'Spring', 12/1/2 -> 'Summer', 3-5 -> 'Autumn',
    6-8 -> 'Winter' (southern-hemisphere convention, matching the
    dataset's origin).

    Raises
    ------
    ValueError
        If *x* is not a valid month number.  The original fell through
        all branches and raised UnboundLocalError at ``return season``.
    """
    if x in (9, 10, 11):
        return 'Spring'
    if x in (12, 1, 2):
        return 'Summer'
    if x in (3, 4, 5):
        return 'Autumn'
    if x in (6, 7, 8):
        return 'Winter'
    raise ValueError(f"month number must be in 1..12, got {x!r}")
# Derive the Season feature from the month number.
data['Season'] = data['Month'].map(convert_to_season)
data.head(3)
| Date | Country | Local | Industry Sector | Accident Level | Potential Accident Level | Gender | Natureofemployee | Critical Risk | Description | Year | Month | Weekday | Season | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2016-01-01 | Country_01 | 1 | Mining | I | IV | Male | Third Party | Pressed | While removing the drill rod of the Jumbo 08 f... | 2016 | 1 | Friday | Summer |
| 1 | 2016-01-02 | Country_02 | 2 | Mining | I | IV | Male | Employee | Pressurized Systems | During the activation of a sodium sulphide pum... | 2016 | 1 | Saturday | Summer |
| 2 | 2016-01-06 | Country_01 | 3 | Mining | I | III | Male | Third Party (Remote) | Manual Tools | In the sub-station MILPO located at level +170... | 2016 | 1 | Wednesday | Summer |
# Categorical / discrete columns selected for univariate analysis
# (the free-text Description and the raw Date column are excluded).
features=['Country', 'Local', 'Industry Sector', 'Accident Level',
'Potential Accident Level', 'Gender', 'Natureofemployee',
'Critical Risk', 'Year', 'Month', 'Weekday', 'Season']
def univariate_analysis_categorical(dataset, feature):
    """Univariate EDA for one categorical column.

    Prints the unique values and value counts of *feature*, and draws a
    countplot (ordered by frequency), a pie chart, and a histplot.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the column.
    feature : str
        Name of the categorical column to analyse.

    Changes vs. original: removed the unused local ``labels`` and pass
    the vector to seaborn via keyword (positional vector arguments are
    deprecated in seaborn >= 0.12).
    """
    print("\n")
    print("===========================================================================================")
    print("Univariate Analysis of feature: ", feature)
    print("===========================================================================================\n")
    print("Unique values: ", feature)
    print("-----------------")
    print(dataset[feature].unique())
    print("\n")
    print("-----------------")
    print("Countplot for feature: ", feature)
    print("-----------------")
    plt.figure(figsize=(10, 6))
    sns.countplot(x=dataset[feature], order=dataset[feature].value_counts().index)
    plt.xticks(rotation='vertical')
    plt.show()
    print("-----------------")
    print("Pie Chart for feature: ", feature)
    print("------------------")
    plt.figure(figsize=(10, 6))
    dataset[feature].value_counts().plot.pie(autopct="%.1f%%")
    plt.show()
    print("\n")
    print("-----------------")
    print("Histplot for feature: ", feature)
    print("-------------------")
    plt.figure(figsize=(10, 6))
    sns.histplot(x=dataset[feature])
    plt.show()
    print("\n")
    print("-----------------")
    print("Value Counts for feature: ", feature)
    print("-------------------")
    print(dataset[feature].value_counts().sort_values(ascending=False))
# Univariate analysis of the Country feature.
univariate_analysis_categorical(data, 'Country')
=========================================================================================== Univariate Analysis of feature: Country =========================================================================================== Unique values: Country ----------------- ['Country_01' 'Country_02' 'Country_03'] ----------------- Countplot for feature: Country -----------------
----------------- Pie Chart for feature: Country ------------------
----------------- Histplot for feature: Country -------------------
----------------- Value Counts for feature: Country ------------------- Country_01 248 Country_02 129 Country_03 41 Name: Country, dtype: int64
----- From the above plots, we can conclude the following
The country_01 has a count of about 248. Country _02 has a count of about 129. Country_03 has a count of about 41.
From the above pie chart, it can be inferred that Country_01 is the most affected country, with about 59% of accidents, and Country_03 is the least affected country.
From the above output, the country_01 has maximum accidents and country_03 has minimum accidents.
# Univariate analysis of the Local feature.
univariate_analysis_categorical(data, 'Local')
=========================================================================================== Univariate Analysis of feature: Local =========================================================================================== Unique values: Local ----------------- [ 1 2 3 4 5 6 7 8 10 9 11 12] ----------------- Countplot for feature: Local -----------------
----------------- Pie Chart for feature: Local ------------------
----------------- Histplot for feature: Local -------------------
----------------- Value Counts for feature: Local ------------------- 3 89 5 59 1 56 4 55 6 46 10 41 8 27 2 23 7 14 12 4 9 2 11 2 Name: Local, dtype: int64
# Univariate analysis of the Industry Sector feature.
univariate_analysis_categorical(data,'Industry Sector')
=========================================================================================== Univariate Analysis of feature: Industry Sector =========================================================================================== Unique values: Industry Sector ----------------- ['Mining' 'Metals' 'Others'] ----------------- Countplot for feature: Industry Sector -----------------
----------------- Pie Chart for feature: Industry Sector ------------------
----------------- Histplot for feature: Industry Sector -------------------
----------------- Value Counts for feature: Industry Sector ------------------- Mining 237 Metals 134 Others 47 Name: Industry Sector, dtype: int64
# Univariate analysis of the Accident Level feature.
univariate_analysis_categorical(data,'Accident Level')
=========================================================================================== Univariate Analysis of feature: Accident Level =========================================================================================== Unique values: Accident Level ----------------- ['I' 'IV' 'III' 'II' 'V'] ----------------- Countplot for feature: Accident Level -----------------
----------------- Pie Chart for feature: Accident Level ------------------
----------------- Histplot for feature: Accident Level -------------------
----------------- Value Counts for feature: Accident Level ------------------- I 309 II 40 III 31 IV 30 V 8 Name: Accident Level, dtype: int64
# Univariate analysis of the Potential Accident Level feature.
univariate_analysis_categorical(data,'Potential Accident Level')
=========================================================================================== Univariate Analysis of feature: Potential Accident Level =========================================================================================== Unique values: Potential Accident Level ----------------- ['IV' 'III' 'I' 'II' 'V' 'VI'] ----------------- Countplot for feature: Potential Accident Level -----------------
----------------- Pie Chart for feature: Potential Accident Level ------------------
----------------- Histplot for feature: Potential Accident Level -------------------
----------------- Value Counts for feature: Potential Accident Level ------------------- IV 141 III 106 II 95 I 45 V 30 VI 1 Name: Potential Accident Level, dtype: int64
# Univariate analysis of the Gender feature.
univariate_analysis_categorical(data,'Gender')
=========================================================================================== Univariate Analysis of feature: Gender =========================================================================================== Unique values: Gender ----------------- ['Male' 'Female'] ----------------- Countplot for feature: Gender -----------------
----------------- Pie Chart for feature: Gender ------------------
----------------- Histplot for feature: Gender -------------------
----------------- Value Counts for feature: Gender ------------------- Male 396 Female 22 Name: Gender, dtype: int64
# Univariate analysis of the Natureofemployee feature.
univariate_analysis_categorical(data,'Natureofemployee')
=========================================================================================== Univariate Analysis of feature: Natureofemployee =========================================================================================== Unique values: Natureofemployee ----------------- ['Third Party' 'Employee' 'Third Party (Remote)'] ----------------- Countplot for feature: Natureofemployee -----------------
----------------- Pie Chart for feature: Natureofemployee ------------------
----------------- Histplot for feature: Natureofemployee -------------------
----------------- Value Counts for feature: Natureofemployee ------------------- Third Party 185 Employee 178 Third Party (Remote) 55 Name: Natureofemployee, dtype: int64
From the above, it can be determined that employees of the Third Party type are the most prone to accidents.
# Univariate analysis of the Critical Risk feature (the function already
# orders its countplot by descending frequency).
univariate_analysis_categorical(data, 'Critical Risk')
=========================================================================================== Univariate Analysis of feature: Critical Risk =========================================================================================== Unique values: Critical Risk ----------------- ['Pressed' 'Pressurized Systems' 'Manual Tools' 'Others' 'Fall prevention (same level)' 'Chemical substances' 'Liquid Metal' 'Electrical installation' 'Confined space' 'Pressurized Systems / Chemical Substances' 'Blocking and isolation of energies' 'Suspended Loads' 'Poll' 'Cut' 'Fall' 'Bees' 'Fall prevention' '\nNot applicable' 'Traffic' 'Projection' 'Venomous Animals' 'Plates' 'Projection/Burning' 'remains of choco' 'Vehicles and Mobile Equipment' 'Projection/Choco' 'Machine Protection' 'Power lock' 'Burn' 'Projection/Manual Tools' 'Individual protection equipment' 'Electrical Shock' 'Projection of fragments'] ----------------- Countplot for feature: Critical Risk -----------------
----------------- Pie Chart for feature: Critical Risk ------------------
----------------- Histplot for feature: Critical Risk -------------------
----------------- Value Counts for feature: Critical Risk ------------------- Others 229 Pressed 24 Manual Tools 20 Chemical substances 17 Cut 14 Venomous Animals 13 Projection 13 Bees 10 Fall 9 Vehicles and Mobile Equipment 8 Fall prevention (same level) 7 Pressurized Systems 7 remains of choco 7 Suspended Loads 6 Fall prevention 6 Power lock 3 Pressurized Systems / Chemical Substances 3 Blocking and isolation of energies 3 Liquid Metal 3 Electrical Shock 2 Machine Protection 2 Individual protection equipment 1 Projection of fragments 1 Burn 1 Confined space 1 Projection/Burning 1 Projection/Choco 1 \nNot applicable 1 Plates 1 Poll 1 Electrical installation 1 Traffic 1 Projection/Manual Tools 1 Name: Critical Risk, dtype: int64
When we count the number of incidents by each type of critical risk, Others tops the list.
# Univariate analysis of the Year feature.
univariate_analysis_categorical(data,'Year')
=========================================================================================== Univariate Analysis of feature: Year =========================================================================================== Unique values: Year ----------------- [2016 2017] ----------------- Countplot for feature: Year -----------------
----------------- Pie Chart for feature: Year ------------------
----------------- Histplot for feature: Year -------------------
----------------- Value Counts for feature: Year ------------------- 2016 283 2017 135 Name: Year, dtype: int64
From the above, it is clearly evident that most accidents happened in the year 2016 — more than 250.
# Univariate analysis of the Month feature.
univariate_analysis_categorical(data,'Month')
=========================================================================================== Univariate Analysis of feature: Month =========================================================================================== Unique values: Month ----------------- [ 1 2 3 4 5 6 7 8 9 10 11 12] ----------------- Countplot for feature: Month -----------------
----------------- Pie Chart for feature: Month ------------------
----------------- Histplot for feature: Month -------------------
----------------- Value Counts for feature: Month ------------------- 2 61 4 51 6 51 3 50 5 40 1 39 7 24 9 24 12 23 8 21 10 21 11 13 Name: Month, dtype: int64
# Univariate analysis of the Weekday feature.
univariate_analysis_categorical(data,'Weekday')
=========================================================================================== Univariate Analysis of feature: Weekday =========================================================================================== Unique values: Weekday ----------------- ['Friday' 'Saturday' 'Wednesday' 'Sunday' 'Tuesday' 'Thursday' 'Monday'] ----------------- Countplot for feature: Weekday -----------------
----------------- Pie Chart for feature: Weekday ------------------
----------------- Histplot for feature: Weekday -------------------
----------------- Value Counts for feature: Weekday ------------------- Thursday 76 Tuesday 69 Wednesday 62 Friday 61 Saturday 56 Monday 53 Sunday 41 Name: Weekday, dtype: int64
# Countplot of Accident Level split by Gender.
sns.countplot(x="Accident Level",hue="Gender", data=data)
<AxesSubplot:xlabel='Accident Level', ylabel='count'>
# Cross-tabulate Accident Level against Gender.
bivariate_analysis_df = pd.crosstab(data['Accident Level'], data['Gender'])
print("------------------------------------------")
print("Cross table Analysis of features: ",'Accident Level',' and ', 'Gender')
print("------------------------------------------")
display(bivariate_analysis_df)
------------------------------------------ Cross table Analysis of features: Accident Level and Gender ------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Accident Level | ||
| I | 18 | 291 |
| II | 3 | 37 |
| III | 1 | 30 |
| IV | 0 | 30 |
| V | 0 | 8 |
From the above count plot, it can be determined that the most of the accidents happened at level I with gender male.
# Countplot of Potential Accident Level split by Gender.
sns.countplot(x="Potential Accident Level",hue="Gender", data=data)
<AxesSubplot:xlabel='Potential Accident Level', ylabel='count'>
# Cross-tabulate Potential Accident Level against Gender.
bivariate_analysis_df = pd.crosstab(data['Potential Accident Level'], data['Gender'])
print("\n Cross table Analysis of features: ",'Potential Accident Level',' and ', 'Gender')
print("--------------------------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Potential Accident Level and Gender --------------------------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Potential Accident Level | ||
| I | 0 | 45 |
| II | 14 | 81 |
| III | 3 | 103 |
| IV | 4 | 137 |
| V | 1 | 29 |
| VI | 0 | 1 |
From the above, it can be determined that most of the potential-level accidents happened to males compared to females, of which Potential Accident Level IV is dominant.
# Countplot of accident counts per Country split by Gender.
sns.countplot(x="Country",hue="Gender", data=data)
<AxesSubplot:xlabel='Country', ylabel='count'>
# Cross-tabulate Country against Gender.
bivariate_analysis_df = pd.crosstab(data['Country'], data['Gender'])
print("\n Cross table Analysis of features: ",'Country',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Country and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Country | ||
| Country_01 | 7 | 241 |
| Country_02 | 15 | 114 |
| Country_03 | 0 | 41 |
From the above countplot, it can be determined that the maximum number of accidents took place in country_01 to males and they are about 241.
# Countplot of accident counts per Industry Sector split by Gender.
sns.countplot(x="Industry Sector",hue="Gender", data=data)
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Cross-tabulate Industry Sector against Gender.
bivariate_analysis_df = pd.crosstab(data['Industry Sector'], data['Gender'])
print("\n Cross table Analysis of features: ",'Industry Sector',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Industry Sector and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Industry Sector | ||
| Metals | 13 | 121 |
| Mining | 5 | 232 |
| Others | 4 | 43 |
From the above count plot, it is evident that most of the accidents happened to Male in the mining sector, around 232.
# Countplot of accident counts per Year split by Gender.
sns.countplot(x="Year",hue="Gender", data=data)
<AxesSubplot:xlabel='Year', ylabel='count'>
# Cross-tabulate Year against Gender.
bivariate_analysis_df = pd.crosstab(data['Year'], data['Gender'])
print("\n Cross table Analysis of features: ",'Year',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Year and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Year | ||
| 2016 | 14 | 269 |
| 2017 | 8 | 127 |
From the above countplot, it is clearly evident that maximum accidents took place in 2016 to the male when compared to female with a count of 269.
# Countplot of accident counts per Month split by Gender.
sns.countplot(x="Month",hue="Gender", data=data)
<AxesSubplot:xlabel='Month', ylabel='count'>
# Cross-tabulate Month against Gender.
bivariate_analysis_df = pd.crosstab(data['Month'], data['Gender'])
print("\n Cross table Analysis of features: ",'Month',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Month and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Month | ||
| 1 | 0 | 39 |
| 2 | 4 | 57 |
| 3 | 1 | 49 |
| 4 | 1 | 50 |
| 5 | 4 | 36 |
| 6 | 1 | 50 |
| 7 | 4 | 20 |
| 8 | 3 | 18 |
| 9 | 3 | 21 |
| 10 | 0 | 21 |
| 11 | 0 | 13 |
| 12 | 1 | 22 |
From the above count plot, it is determined that the maximum number of accidents happened to males in the month of February, with a count of 57.
# Countplot of accident counts per Weekday split by Gender.
sns.countplot(x="Weekday",hue="Gender", data=data)
<AxesSubplot:xlabel='Weekday', ylabel='count'>
# Cross-tabulate Weekday against Gender.
bivariate_analysis_df = pd.crosstab(data['Weekday'], data['Gender'])
print("\n Cross table Analysis of features: ",'Weekday',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Weekday and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Weekday | ||
| Friday | 2 | 59 |
| Monday | 4 | 49 |
| Saturday | 1 | 55 |
| Sunday | 2 | 39 |
| Thursday | 3 | 73 |
| Tuesday | 6 | 63 |
| Wednesday | 4 | 58 |
The maximum number of accidents happened to males on Thursday, with a count of 73.
# Countplot of accident counts per employee type split by Gender.
sns.countplot(x="Natureofemployee",hue="Gender", data=data)
<AxesSubplot:xlabel='Natureofemployee', ylabel='count'>
# Cross-tabulate Natureofemployee against Gender.
bivariate_analysis_df = pd.crosstab(data['Natureofemployee'], data['Gender'])
print("\n Cross table Analysis of features: ",'Natureofemployee',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Natureofemployee and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Natureofemployee | ||
| Employee | 8 | 170 |
| Third Party | 9 | 176 |
| Third Party (Remote) | 5 | 50 |
From the above output, it is clearly evident that maximum accidents happened to third party male employees. i.e- 176.
# Critical Risk counts split by Gender; the legend is moved outside the
# axes because there are 33 risk categories.
sns.countplot(data=data, x="Critical Risk", hue="Gender")
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
plt.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]),
[Text(0, 0, 'Pressed'),
Text(1, 0, 'Pressurized Systems'),
Text(2, 0, 'Manual Tools'),
Text(3, 0, 'Others'),
Text(4, 0, 'Fall prevention (same level)'),
Text(5, 0, 'Chemical substances'),
Text(6, 0, 'Liquid Metal'),
Text(7, 0, 'Electrical installation'),
Text(8, 0, 'Confined space'),
Text(9, 0, 'Pressurized Systems / Chemical Substances'),
Text(10, 0, 'Blocking and isolation of energies'),
Text(11, 0, 'Suspended Loads'),
Text(12, 0, 'Poll'),
Text(13, 0, 'Cut'),
Text(14, 0, 'Fall'),
Text(15, 0, 'Bees'),
Text(16, 0, 'Fall prevention'),
Text(17, 0, '\nNot applicable'),
Text(18, 0, 'Traffic'),
Text(19, 0, 'Projection'),
Text(20, 0, 'Venomous Animals'),
Text(21, 0, 'Plates'),
Text(22, 0, 'Projection/Burning'),
Text(23, 0, 'remains of choco'),
Text(24, 0, 'Vehicles and Mobile Equipment'),
Text(25, 0, 'Projection/Choco'),
Text(26, 0, 'Machine Protection'),
Text(27, 0, 'Power lock'),
Text(28, 0, 'Burn'),
Text(29, 0, 'Projection/Manual Tools'),
Text(30, 0, 'Individual protection equipment'),
Text(31, 0, 'Electrical Shock'),
Text(32, 0, 'Projection of fragments')])
# Cross-tabulate Critical Risk against Gender.
bivariate_analysis_df = pd.crosstab(data['Critical Risk'], data['Gender'])
print("\n Cross table Analysis of features: ",'Critical Risk',' and ', 'Gender')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Critical Risk and Gender ------------------------------------------------------
| Gender | Female | Male |
|---|---|---|
| Critical Risk | ||
| \nNot applicable | 0 | 1 |
| Bees | 0 | 10 |
| Blocking and isolation of energies | 0 | 3 |
| Burn | 0 | 1 |
| Chemical substances | 4 | 13 |
| Confined space | 0 | 1 |
| Cut | 3 | 11 |
| Electrical Shock | 0 | 2 |
| Electrical installation | 0 | 1 |
| Fall | 1 | 8 |
| Fall prevention | 0 | 6 |
| Fall prevention (same level) | 1 | 6 |
| Individual protection equipment | 0 | 1 |
| Liquid Metal | 0 | 3 |
| Machine Protection | 0 | 2 |
| Manual Tools | 1 | 19 |
| Others | 9 | 220 |
| Plates | 0 | 1 |
| Poll | 0 | 1 |
| Power lock | 0 | 3 |
| Pressed | 0 | 24 |
| Pressurized Systems | 1 | 6 |
| Pressurized Systems / Chemical Substances | 0 | 3 |
| Projection | 0 | 13 |
| Projection of fragments | 0 | 1 |
| Projection/Burning | 0 | 1 |
| Projection/Choco | 0 | 1 |
| Projection/Manual Tools | 0 | 1 |
| Suspended Loads | 0 | 6 |
| Traffic | 1 | 0 |
| Vehicles and Mobile Equipment | 0 | 8 |
| Venomous Animals | 1 | 12 |
| remains of choco | 0 | 7 |
Critical Risk of type "Others" is dominant across both Male and Female Genders
# Countplot of Accident Level within each Industry Sector.
sns.countplot(x="Industry Sector",hue="Accident Level", data=data)
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Cross-tabulate Industry Sector against Accident Level.
bivariate_analysis_df = pd.crosstab(data['Industry Sector'], data['Accident Level'])
print("\n Cross table Analysis of features: ",'Industry Sector',' and ', 'Accident Level')
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Industry Sector and Accident Level ------------------------------------------------------
| Accident Level | I | II | III | IV | V |
|---|---|---|---|---|---|
| Industry Sector | |||||
| Metals | 107 | 12 | 7 | 7 | 1 |
| Mining | 163 | 26 | 20 | 21 | 7 |
| Others | 39 | 2 | 4 | 2 | 0 |
Maximum number of accidents happened in the mining sector with accident Level I. i.e- 163.
# Count plot of potential accident levels per industry sector
sns.countplot(data=data, x="Industry Sector", hue="Potential Accident Level")
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Cross-tabulate Industry Sector against Potential Accident Level
ct = pd.crosstab(data['Industry Sector'], data['Potential Accident Level'])
print("\n Cross table Analysis of features: ",'Industry Sector',' and ', 'Potential Accident Level')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Industry Sector and Potential Accident Level ------------------------------------------------------
| Potential Accident Level | I | II | III | IV | V | VI |
|---|---|---|---|---|---|---|
| Industry Sector | ||||||
| Metals | 7 | 48 | 44 | 33 | 2 | 0 |
| Mining | 8 | 40 | 61 | 99 | 28 | 1 |
| Others | 30 | 7 | 1 | 9 | 0 | 0 |
Maximum number of accidents happened in the potential accident level 4 and mining sector with a count 99. Minimum number of accidents took place in the mining sector at a potential accident level 6.
#Countplot to determine the number of accidents taken place at the industry sector wrt critical risk
fig = plt.figure(figsize=(15, 7.2))
ax = fig.add_subplot(121)
# Plot counts per Critical Risk category, split by Industry Sector.
sns.countplot(x='Critical Risk', data=data, ax=ax, orient='v',
              hue='Industry Sector')
# Fix: do NOT pass labels=data['Industry Sector'].unique() to plt.legend() —
# unique() order is not guaranteed to match the order in which seaborn drew
# the hue groups, so the legend entries could end up mislabelled. Keep
# seaborn's own hue legend and only set its title.
ax.legend(title='Industry Sector')
# Category names are long; rotate the x tick labels so they stay readable.
plt.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]),
[Text(0, 0, 'Pressed'),
Text(1, 0, 'Pressurized Systems'),
Text(2, 0, 'Manual Tools'),
Text(3, 0, 'Others'),
Text(4, 0, 'Fall prevention (same level)'),
Text(5, 0, 'Chemical substances'),
Text(6, 0, 'Liquid Metal'),
Text(7, 0, 'Electrical installation'),
Text(8, 0, 'Confined space'),
Text(9, 0, 'Pressurized Systems / Chemical Substances'),
Text(10, 0, 'Blocking and isolation of energies'),
Text(11, 0, 'Suspended Loads'),
Text(12, 0, 'Poll'),
Text(13, 0, 'Cut'),
Text(14, 0, 'Fall'),
Text(15, 0, 'Bees'),
Text(16, 0, 'Fall prevention'),
Text(17, 0, '\nNot applicable'),
Text(18, 0, 'Traffic'),
Text(19, 0, 'Projection'),
Text(20, 0, 'Venomous Animals'),
Text(21, 0, 'Plates'),
Text(22, 0, 'Projection/Burning'),
Text(23, 0, 'remains of choco'),
Text(24, 0, 'Vehicles and Mobile Equipment'),
Text(25, 0, 'Projection/Choco'),
Text(26, 0, 'Machine Protection'),
Text(27, 0, 'Power lock'),
Text(28, 0, 'Burn'),
Text(29, 0, 'Projection/Manual Tools'),
Text(30, 0, 'Individual protection equipment'),
Text(31, 0, 'Electrical Shock'),
Text(32, 0, 'Projection of fragments')])
# Cross-tabulate Critical Risk against Industry Sector
ct = pd.crosstab(data['Critical Risk'], data['Industry Sector'])
print("\n Cross table Analysis of features: ",'Critical Risk',' and ', 'Industry Sector')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Critical Risk and Industry Sector ------------------------------------------------------
| Industry Sector | Metals | Mining | Others |
|---|---|---|---|
| Critical Risk | |||
| \nNot applicable | 1 | 0 | 0 |
| Bees | 0 | 0 | 10 |
| Blocking and isolation of energies | 3 | 0 | 0 |
| Burn | 1 | 0 | 0 |
| Chemical substances | 15 | 2 | 0 |
| Confined space | 1 | 0 | 0 |
| Cut | 10 | 4 | 0 |
| Electrical Shock | 0 | 2 | 0 |
| Electrical installation | 0 | 1 | 0 |
| Fall | 2 | 5 | 2 |
| Fall prevention | 3 | 2 | 1 |
| Fall prevention (same level) | 6 | 1 | 0 |
| Individual protection equipment | 0 | 1 | 0 |
| Liquid Metal | 3 | 0 | 0 |
| Machine Protection | 2 | 0 | 0 |
| Manual Tools | 14 | 5 | 1 |
| Others | 33 | 176 | 20 |
| Plates | 1 | 0 | 0 |
| Poll | 0 | 0 | 1 |
| Power lock | 1 | 2 | 0 |
| Pressed | 17 | 7 | 0 |
| Pressurized Systems | 6 | 1 | 0 |
| Pressurized Systems / Chemical Substances | 3 | 0 | 0 |
| Projection | 4 | 9 | 0 |
| Projection of fragments | 0 | 1 | 0 |
| Projection/Burning | 1 | 0 | 0 |
| Projection/Choco | 0 | 0 | 1 |
| Projection/Manual Tools | 0 | 1 | 0 |
| Suspended Loads | 5 | 1 | 0 |
| Traffic | 0 | 0 | 1 |
| Vehicles and Mobile Equipment | 0 | 8 | 0 |
| Venomous Animals | 2 | 1 | 10 |
| remains of choco | 0 | 7 | 0 |
From the above count plot, it is evident that maximum number of accidents happened in mining with a critical risk of others. i.e- about 175
# Count plot: accidents per Local, split by Industry Sector
sns.countplot(data=data, x="Local", hue="Industry Sector")
<AxesSubplot:xlabel='Local', ylabel='count'>
# Cross-tabulate Industry Sector against Local
ct = pd.crosstab(data['Industry Sector'], data['Local'])
print("\n Cross table Analysis of features: ",'Industry Sector',' and ', 'Local')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Industry Sector and Local ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Industry Sector | ||||||||||||
| Metals | 0 | 0 | 0 | 0 | 59 | 46 | 0 | 27 | 2 | 0 | 0 | 0 |
| Mining | 56 | 23 | 89 | 55 | 0 | 0 | 14 | 0 | 0 | 0 | 0 | 0 |
| Others | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 41 | 2 | 4 |
Most accidents happened in Local 3 within the mining sector (89 accidents). The fewest non-zero counts are tied at 2: Local 11 in the others sector and Local 9 in the metals sector.
# Count plot of accidents in 2016 vs 2017, split by industry sector
sns.countplot(data=data, x="Year", hue="Industry Sector")
<AxesSubplot:xlabel='Year', ylabel='count'>
# Cross-tabulate Industry Sector against Year
ct = pd.crosstab(data['Industry Sector'], data['Year'])
print("\n Cross table Analysis of features: ",'Industry Sector',' and ', 'Year')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Industry Sector and Year ------------------------------------------------------
| Year | 2016 | 2017 |
|---|---|---|
| Industry Sector | ||
| Metals | 97 | 37 |
| Mining | 159 | 78 |
| Others | 27 | 20 |
1. The number of accidents that took place in the year 2016 in the mining sector is 159.
2.The number of accidents taken place in year 2016 wrt metals sector is about 100.
3.The number of accidents taken place in the year 2016 wrt others sector is about 30. Hence, it can be determined that maximum accidents took place in mining sector in the year 2016.
4. The number of accidents that took place in the year 2017 in the mining sector is 78.
5.The number of accidents taken place in the year 2017 wrt metals sector is about 40.
6.The number of accidents taken place in the year 2017 wrt others sector is 20.
Hence, it can be determined that max accidents took place in mining sector in the year 2017
# Count plot of accidents per industry sector, split by month;
# place the legend outside the axes so it does not cover the bars
sns.countplot(data=data, x="Industry Sector", hue="Month")
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x18a26d80040>
# Cross-tabulate Industry Sector against Month
ct = pd.crosstab(data['Industry Sector'], data['Month'])
print("\n Cross table Analysis of features: ",'Industry Sector',' and ', 'Month')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Industry Sector and Month ------------------------------------------------------
| Month | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Industry Sector | ||||||||||||
| Metals | 13 | 19 | 17 | 15 | 12 | 11 | 9 | 9 | 11 | 5 | 5 | 8 |
| Mining | 24 | 38 | 26 | 31 | 20 | 28 | 13 | 10 | 10 | 15 | 7 | 15 |
| Others | 2 | 4 | 7 | 5 | 8 | 12 | 2 | 2 | 3 | 1 | 1 | 0 |
The maximum number of accidents happened in February in the mining sector (38). The least number of accidents took place in the others sector in December (0).
# Count plot: accidents per Industry Sector, split by Weekday
sns.countplot(data=data, x="Industry Sector", hue="Weekday")
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Cross-tabulate Industry Sector against Weekday
ct = pd.crosstab(data['Industry Sector'], data['Weekday'])
print("\n Cross table Analysis of features: ",'Industry Sector',' and ', 'Weekday')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Industry Sector and Weekday ------------------------------------------------------
| Weekday | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| Industry Sector | |||||||
| Metals | 20 | 24 | 6 | 15 | 25 | 21 | 23 |
| Mining | 40 | 26 | 44 | 25 | 37 | 36 | 29 |
| Others | 1 | 3 | 6 | 1 | 14 | 12 | 10 |
The maximum number of accidents happened on Saturday in the mining sector, i.e. more than 40. The fewest accidents in the others sector happened on Friday and Sunday (1 each).
# Count plot: accidents per Industry Sector, split by Country
sns.countplot(data=data, x="Industry Sector", hue="Country")
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Cross-tabulate Industry Sector against Country
ct = pd.crosstab(data['Industry Sector'], data['Country'])
print("\n Cross table Analysis of features: ",'Industry Sector',' and ', 'Country')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Industry Sector and Country ------------------------------------------------------
| Country | Country_01 | Country_02 | Country_03 |
|---|---|---|---|
| Industry Sector | |||
| Metals | 46 | 88 | 0 |
| Mining | 200 | 37 | 0 |
| Others | 2 | 4 | 41 |
From the above count plot, it is evident that the maximum number of accidents took place in country_01 in the mining sector, i.e. 200. The least (non-zero) number of accidents took place in country_01 in the others sector (2).
# Count plot: accidents per Industry Sector, split by nature of employee
sns.countplot(data=data, x="Industry Sector", hue="Natureofemployee")
<AxesSubplot:xlabel='Industry Sector', ylabel='count'>
# Cross-tabulate Industry Sector against Natureofemployee
ct = pd.crosstab(data['Industry Sector'], data['Natureofemployee'])
print("\n Cross table Analysis of features: ",'Industry Sector',' and ', 'Natureofemployee')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Industry Sector and Natureofemployee ------------------------------------------------------
| Natureofemployee | Employee | Third Party | Third Party (Remote) |
|---|---|---|---|
| Industry Sector | |||
| Metals | 76 | 31 | 27 |
| Mining | 89 | 120 | 28 |
| Others | 13 | 34 | 0 |
From the above count plot, it is clearly evident that the maximum accidents took place in the mining sector with the third party employee type. i.e- about 120. The least number of accidents took place in the others sectors with the nature of employee as employee.
# Count plot: accidents per Country, split by Year
sns.countplot(data=data, x="Country", hue="Year")
<AxesSubplot:xlabel='Country', ylabel='count'>
# Cross-tabulate Country against Year
ct = pd.crosstab(data['Country'], data['Year'])
print("\n Cross table Analysis of features: ",'Country',' and ', 'Year')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Country and Year ------------------------------------------------------
| Year | 2016 | 2017 |
|---|---|---|
| Country | ||
| Country_01 | 174 | 74 |
| Country_02 | 86 | 43 |
| Country_03 | 23 | 18 |
From the above output, the following can be determined-
1.The number of accidents taken place in country_01 and year 2016 is 174.
2.The number of accidents taken place in country_01 and year 2017 is about 74.
3.The number of accidents taken place in country_02 and year 2016 is more than 86.
4.The number of accidents taken place in country_02 and year 2017 is about 43.
5.The number of accidents taken place in country_03 and year 2016 is about 23.
6.The number of accidents taken place in country_03 and year 2017 is about 18.
# Count plot: accidents per Country, split by Accident Level
sns.countplot(data=data, x="Country", hue="Accident Level")
<AxesSubplot:xlabel='Country', ylabel='count'>
# Cross-tabulate Country against Accident Level
ct = pd.crosstab(data['Country'], data['Accident Level'])
print("\n Cross table Analysis of features: ",'Country',' and ', 'Accident Level')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Country and Accident Level ------------------------------------------------------
| Accident Level | I | II | III | IV | V |
|---|---|---|---|---|---|
| Country | |||||
| Country_01 | 177 | 19 | 21 | 23 | 8 |
| Country_02 | 98 | 19 | 7 | 5 | 0 |
| Country_03 | 34 | 2 | 3 | 2 | 0 |
From the above count plot, it is clearly evident that the maximum number of accidents took place in accident level 1 and country_01.
# Count plot: accidents per Country, split by Potential Accident Level
sns.countplot(data=data, x="Country", hue="Potential Accident Level")
<AxesSubplot:xlabel='Country', ylabel='count'>
# Cross-tabulate Country against Potential Accident Level
ct = pd.crosstab(data['Country'], data['Potential Accident Level'])
print("\n Cross table Analysis of features: ",'Country',' and ', 'Potential Accident Level')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Country and Potential Accident Level ------------------------------------------------------
| Potential Accident Level | I | II | III | IV | V | VI |
|---|---|---|---|---|---|---|
| Country | ||||||
| Country_01 | 10 | 51 | 64 | 101 | 21 | 1 |
| Country_02 | 6 | 40 | 41 | 33 | 9 | 0 |
| Country_03 | 29 | 4 | 1 | 7 | 0 | 0 |
From the above table, the maximum number of accidents occurred in country_01 at potential accident level IV (101).
# Count plot: accidents per Country, split by Local (legend kept outside the axes)
sns.countplot(data=data, x="Country", hue="Local")
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x18a28263070>
# Cross-tabulate Country against Local
ct = pd.crosstab(data['Country'], data['Local'])
print("\n Cross table Analysis of features: ",'Country',' and ', 'Local')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Country and Local ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Country | ||||||||||||
| Country_01 | 56 | 0 | 89 | 55 | 0 | 46 | 0 | 0 | 0 | 0 | 2 | 0 |
| Country_02 | 0 | 23 | 0 | 0 | 59 | 0 | 14 | 27 | 2 | 0 | 0 | 4 |
| Country_03 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 41 | 0 | 0 |
Country 01 is most dominant in the Local 3 region (89 accidents) and has its fewest recorded accidents in Local 11 (2); it has no accidents at all in Locals 2, 5, 7–10 and 12.
# Count plot: accidents per Country, split by nature of employee
sns.countplot(data=data, x="Country", hue="Natureofemployee")
<AxesSubplot:xlabel='Country', ylabel='count'>
# Cross-tabulate Country against Natureofemployee
ct = pd.crosstab(data['Country'], data['Natureofemployee'])
print("\n Cross table Analysis of features: ",'Country',' and ', 'Natureofemployee')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Country and Natureofemployee ------------------------------------------------------
| Natureofemployee | Employee | Third Party | Third Party (Remote) |
|---|---|---|---|
| Country | |||
| Country_01 | 87 | 138 | 23 |
| Country_02 | 84 | 13 | 32 |
| Country_03 | 7 | 34 | 0 |
Accidents in Country 01 are dominated by the Third Party employee type, while Country 03 has no accidents at all for the Third Party (Remote) type.
# Count plot: accidents per Country, split by Critical Risk (legend outside the axes)
sns.countplot(data=data, x="Country", hue="Critical Risk")
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x18a2836ba90>
# Cross-tabulate Critical Risk (rows) against Country (columns)
ct = pd.crosstab(data['Critical Risk'], data['Country'])
print("\n Cross table Analysis of features: ",'Country',' and ', 'Critical Risk')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Country and Critical Risk ------------------------------------------------------
| Country | Country_01 | Country_02 | Country_03 |
|---|---|---|---|
| Critical Risk | |||
| \nNot applicable | 0 | 1 | 0 |
| Bees | 0 | 0 | 10 |
| Blocking and isolation of energies | 1 | 2 | 0 |
| Burn | 0 | 1 | 0 |
| Chemical substances | 4 | 13 | 0 |
| Confined space | 0 | 1 | 0 |
| Cut | 5 | 9 | 0 |
| Electrical Shock | 2 | 0 | 0 |
| Electrical installation | 1 | 0 | 0 |
| Fall | 6 | 1 | 2 |
| Fall prevention | 3 | 2 | 1 |
| Fall prevention (same level) | 5 | 2 | 0 |
| Individual protection equipment | 1 | 0 | 0 |
| Liquid Metal | 0 | 3 | 0 |
| Machine Protection | 1 | 1 | 0 |
| Manual Tools | 7 | 12 | 1 |
| Others | 169 | 45 | 15 |
| Plates | 1 | 0 | 0 |
| Poll | 0 | 0 | 1 |
| Power lock | 3 | 0 | 0 |
| Pressed | 9 | 15 | 0 |
| Pressurized Systems | 1 | 6 | 0 |
| Pressurized Systems / Chemical Substances | 2 | 1 | 0 |
| Projection | 9 | 4 | 0 |
| Projection of fragments | 1 | 0 | 0 |
| Projection/Burning | 0 | 1 | 0 |
| Projection/Choco | 0 | 0 | 1 |
| Projection/Manual Tools | 1 | 0 | 0 |
| Suspended Loads | 3 | 3 | 0 |
| Traffic | 0 | 1 | 0 |
| Vehicles and Mobile Equipment | 7 | 1 | 0 |
| Venomous Animals | 0 | 3 | 10 |
| remains of choco | 6 | 1 | 0 |
Country 01 is more dominant in Others Critical Risk and Critical Risk is least dominant in Country 03
# Count plot: accidents per Local, split by Accident Level
sns.countplot(data=data, x="Local", hue="Accident Level")
<AxesSubplot:xlabel='Local', ylabel='count'>
# Cross-tabulate Accident Level (rows) against Local (columns)
ct = pd.crosstab(data['Accident Level'], data['Local'])
print("\n Cross table Analysis of features: ",'Local',' and ', 'Accident Level')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Local and Accident Level ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accident Level | ||||||||||||
| I | 45 | 14 | 65 | 30 | 51 | 36 | 9 | 19 | 1 | 34 | 1 | 4 |
| II | 1 | 6 | 8 | 9 | 6 | 1 | 2 | 5 | 0 | 2 | 0 | 0 |
| III | 5 | 2 | 5 | 7 | 2 | 3 | 1 | 1 | 1 | 3 | 1 | 0 |
| IV | 4 | 1 | 8 | 6 | 0 | 5 | 2 | 2 | 0 | 2 | 0 | 0 |
| V | 1 | 0 | 3 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
Accident Level I is most dominant in the Local 3 region with 65 accidents, while Accident Level V is the least frequent across all Locals.
# Count plot: accidents per Local, split by Potential Accident Level (legend outside)
sns.countplot(data=data, x="Local", hue="Potential Accident Level")
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x18a285bc0d0>
# Cross-tabulate Potential Accident Level (rows) against Local (columns)
ct = pd.crosstab(data['Potential Accident Level'], data['Local'])
print("\n Cross table Analysis of features: ",'Local',' and ', 'Potential Accident Level')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Local and Potential Accident Level ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Potential Accident Level | ||||||||||||
| I | 2 | 1 | 2 | 2 | 1 | 4 | 1 | 2 | 0 | 29 | 0 | 1 |
| II | 7 | 6 | 12 | 11 | 15 | 20 | 4 | 12 | 1 | 4 | 1 | 2 |
| III | 13 | 6 | 21 | 19 | 25 | 11 | 2 | 8 | 0 | 1 | 0 | 0 |
| IV | 30 | 7 | 41 | 19 | 18 | 10 | 2 | 4 | 1 | 7 | 1 | 1 |
| V | 4 | 3 | 12 | 4 | 0 | 1 | 5 | 1 | 0 | 0 | 0 | 0 |
| VI | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Overall Local 3 is more prone to Multiple potential accidents, while local 12 is the least
# Count plot: accidents per Local, split by nature of employee (legend outside)
sns.countplot(data=data, x="Local", hue="Natureofemployee")
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x18a28934550>
# Cross-tabulate Natureofemployee (rows) against Local (columns)
ct = pd.crosstab(data['Natureofemployee'], data['Local'])
print("\n Cross table Analysis of features: ",'Local',' and ', 'Natureofemployee')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Local and Natureofemployee ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Natureofemployee | ||||||||||||
| Employee | 23 | 11 | 30 | 14 | 37 | 18 | 11 | 19 | 2 | 7 | 2 | 4 |
| Third Party | 30 | 1 | 48 | 40 | 9 | 20 | 1 | 2 | 0 | 34 | 0 | 0 |
| Third Party (Remote) | 3 | 11 | 11 | 1 | 13 | 8 | 2 | 6 | 0 | 0 | 0 | 0 |
No single employee type dominates every Local: Third Party leads in Locals 1, 3, 4 and 10, while Employee leads in Locals 5, 7 and 8. Third Party (Remote) is generally the least common type across the Locals.
# Count plot: accidents per Local, split by Critical Risk (legend outside the axes)
sns.countplot(data=data, x="Local", hue="Critical Risk")
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x18a289cc520>
# Cross-tabulate Critical Risk (rows) against Local (columns)
ct = pd.crosstab(data['Critical Risk'], data['Local'])
print("\n Cross table Analysis of features: ",'Local',' and ', 'Critical Risk')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Local and Critical Risk ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Critical Risk | ||||||||||||
| \nNot applicable | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| Bees | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | 0 | 0 |
| Blocking and isolation of energies | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| Burn | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Chemical substances | 1 | 1 | 0 | 0 | 11 | 3 | 0 | 1 | 0 | 0 | 0 | 0 |
| Confined space | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Cut | 0 | 0 | 2 | 1 | 7 | 2 | 1 | 1 | 0 | 0 | 0 | 0 |
| Electrical Shock | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Electrical installation | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Fall | 2 | 0 | 1 | 2 | 1 | 1 | 0 | 0 | 0 | 2 | 0 | 0 |
| Fall prevention | 0 | 0 | 1 | 0 | 1 | 2 | 1 | 0 | 0 | 1 | 0 | 0 |
| Fall prevention (same level) | 0 | 0 | 0 | 1 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
| Individual protection equipment | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Liquid Metal | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| Machine Protection | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| Manual Tools | 1 | 2 | 1 | 1 | 5 | 4 | 0 | 4 | 1 | 1 | 0 | 0 |
| Others | 41 | 16 | 68 | 43 | 10 | 15 | 8 | 8 | 0 | 15 | 2 | 3 |
| Plates | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| Poll | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| Power lock | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| Pressed | 2 | 2 | 1 | 2 | 6 | 4 | 0 | 7 | 0 | 0 | 0 | 0 |
| Pressurized Systems | 0 | 1 | 0 | 0 | 3 | 1 | 0 | 2 | 0 | 0 | 0 | 0 |
| Pressurized Systems / Chemical Substances | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| Projection | 3 | 0 | 4 | 0 | 1 | 2 | 2 | 1 | 0 | 0 | 0 | 0 |
| Projection of fragments | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Projection/Burning | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Projection/Choco | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| Projection/Manual Tools | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Suspended Loads | 1 | 0 | 0 | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| Traffic | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| Vehicles and Mobile Equipment | 4 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Venomous Animals | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 10 | 0 | 0 |
| remains of choco | 0 | 0 | 4 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
Critical Risk of type "Others" is dominant across all Locals
# Count plot: accidents per Local, split by Year
sns.countplot(data=data, x="Local", hue="Year")
<AxesSubplot:xlabel='Local', ylabel='count'>
# Cross-tabulate Year (rows) against Local (columns)
ct = pd.crosstab(data['Year'], data['Local'])
print("\n Cross table Analysis of features: ",'Local',' and ', 'Year')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Local and Year ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | ||||||||||||
| 2016 | 38 | 13 | 63 | 38 | 45 | 33 | 7 | 18 | 1 | 23 | 2 | 2 |
| 2017 | 18 | 10 | 26 | 17 | 14 | 13 | 7 | 9 | 1 | 18 | 0 | 2 |
Year 2016 has at least as many accidents as 2017 in every Local region (strictly more in most; Locals 7, 9 and 12 are tied).
# Count plot: actual Accident Level vs Potential Accident Level
sns.countplot(data=data, x="Accident Level", hue="Potential Accident Level")
<AxesSubplot:xlabel='Accident Level', ylabel='count'>
# Cross-tabulate Potential Accident Level (rows) against Accident Level (columns)
ct = pd.crosstab(data['Potential Accident Level'], data['Accident Level'])
print("\n Cross table Analysis of features: ",'Accident Level',' and ', 'Potential Accident Level')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Accident Level and Potential Accident Level ------------------------------------------------------
| Accident Level | I | II | III | IV | V |
|---|---|---|---|---|---|
| Potential Accident Level | |||||
| I | 45 | 0 | 0 | 0 | 0 |
| II | 88 | 7 | 0 | 0 | 0 |
| III | 89 | 14 | 3 | 0 | 0 |
| IV | 78 | 16 | 26 | 21 | 0 |
| V | 9 | 3 | 2 | 9 | 7 |
| VI | 0 | 0 | 0 | 0 | 1 |
Accident Level I incidents span Potential Accident Levels I through V; the single Potential Accident Level VI case corresponds to an actual Accident Level of V.
# Count plot: accidents per Accident Level, split by nature of employee
sns.countplot(data=data, x="Accident Level", hue="Natureofemployee")
<AxesSubplot:xlabel='Accident Level', ylabel='count'>
# Cross-tabulate Accident Level against Natureofemployee
ct = pd.crosstab(data['Accident Level'], data['Natureofemployee'])
print("\n Cross table Analysis of features: ",'Accident Level',' and ', 'Natureofemployee')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Accident Level and Natureofemployee ------------------------------------------------------
| Natureofemployee | Employee | Third Party | Third Party (Remote) |
|---|---|---|---|
| Accident Level | |||
| I | 139 | 130 | 40 |
| II | 15 | 19 | 6 |
| III | 14 | 14 | 3 |
| IV | 10 | 16 | 4 |
| V | 0 | 6 | 2 |
Accident Level I is more dominant across all Employee types, where Level V is least across all types
# Count plot: accidents per Accident Level, split by Critical Risk (legend outside)
sns.countplot(data=data, x="Accident Level", hue="Critical Risk")
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x18a2924ed00>
# Cross-tabulate Critical Risk (rows) against Accident Level (columns)
ct = pd.crosstab(data['Critical Risk'], data['Accident Level'])
print("\n Cross table Analysis of features: ",'Accident Level',' and ', 'Critical Risk')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Accident Level and Critical Risk ------------------------------------------------------
| Accident Level | I | II | III | IV | V |
|---|---|---|---|---|---|
| Critical Risk | |||||
| \nNot applicable | 0 | 0 | 0 | 1 | 0 |
| Bees | 10 | 0 | 0 | 0 | 0 |
| Blocking and isolation of energies | 3 | 0 | 0 | 0 | 0 |
| Burn | 0 | 0 | 1 | 0 | 0 |
| Chemical substances | 15 | 2 | 0 | 0 | 0 |
| Confined space | 1 | 0 | 0 | 0 | 0 |
| Cut | 11 | 2 | 1 | 0 | 0 |
| Electrical Shock | 2 | 0 | 0 | 0 | 0 |
| Electrical installation | 0 | 0 | 0 | 1 | 0 |
| Fall | 6 | 0 | 0 | 2 | 1 |
| Fall prevention | 5 | 0 | 0 | 1 | 0 |
| Fall prevention (same level) | 6 | 0 | 0 | 1 | 0 |
| Individual protection equipment | 0 | 1 | 0 | 0 | 0 |
| Liquid Metal | 3 | 0 | 0 | 0 | 0 |
| Machine Protection | 2 | 0 | 0 | 0 | 0 |
| Manual Tools | 12 | 5 | 3 | 0 | 0 |
| Others | 169 | 21 | 23 | 13 | 3 |
| Plates | 1 | 0 | 0 | 0 | 0 |
| Poll | 0 | 0 | 0 | 1 | 0 |
| Power lock | 0 | 0 | 0 | 1 | 2 |
| Pressed | 17 | 1 | 2 | 4 | 0 |
| Pressurized Systems | 6 | 1 | 0 | 0 | 0 |
| Pressurized Systems / Chemical Substances | 2 | 1 | 0 | 0 | 0 |
| Projection | 10 | 2 | 0 | 1 | 0 |
| Projection of fragments | 1 | 0 | 0 | 0 | 0 |
| Projection/Burning | 0 | 1 | 0 | 0 | 0 |
| Projection/Choco | 1 | 0 | 0 | 0 | 0 |
| Projection/Manual Tools | 1 | 0 | 0 | 0 | 0 |
| Suspended Loads | 4 | 0 | 1 | 1 | 0 |
| Traffic | 1 | 0 | 0 | 0 | 0 |
| Vehicles and Mobile Equipment | 5 | 1 | 0 | 1 | 1 |
| Venomous Animals | 13 | 0 | 0 | 0 | 0 |
| remains of choco | 2 | 2 | 0 | 2 | 1 |
Accident Level I is most dominant within the "Others" Critical Risk type (169 accidents).
# Count plot: accidents per Accident Level, split by Year
sns.countplot(data=data, x="Accident Level", hue="Year")
<AxesSubplot:xlabel='Accident Level', ylabel='count'>
# Cross-tabulate Year (rows) against Accident Level (columns)
ct = pd.crosstab(data['Year'], data['Accident Level'])
print("\n Cross table Analysis of features: ",'Accident Level',' and ', 'Year')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Accident Level and Year ------------------------------------------------------
| Accident Level | I | II | III | IV | V |
|---|---|---|---|---|---|
| Year | |||||
| 2016 | 211 | 26 | 24 | 19 | 3 |
| 2017 | 98 | 14 | 7 | 11 | 5 |
Accident Level I is the most dominant in both 2016 and 2017, while Level V is the least frequent in both years.
# Count plot: accidents per Month, split by Accident Level (legend outside)
sns.countplot(data=data, x="Month", hue="Accident Level")
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x18a295a2130>
# Cross-tabulate Accident Level (rows) against Month (columns)
ct = pd.crosstab(data['Accident Level'], data['Month'])
print("\n Cross table Analysis of features: ",'Accident Level',' and ', 'Month')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Accident Level and Month ------------------------------------------------------
| Month | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accident Level | ||||||||||||
| I | 32 | 42 | 34 | 43 | 31 | 41 | 16 | 15 | 18 | 11 | 9 | 17 |
| II | 2 | 9 | 7 | 2 | 3 | 3 | 1 | 3 | 3 | 4 | 1 | 2 |
| III | 2 | 4 | 3 | 3 | 1 | 2 | 4 | 2 | 2 | 4 | 1 | 3 |
| IV | 2 | 5 | 3 | 3 | 4 | 4 | 2 | 1 | 1 | 2 | 2 | 1 |
| V | 1 | 1 | 3 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
Accident Level 1 dominates across all Months while Level V is minimum
# Count plot: accidents per Country, split by Accident Level
# NOTE(review): this repeats the Country vs Accident Level plot drawn earlier.
sns.countplot(data=data, x="Country", hue="Accident Level")
<AxesSubplot:xlabel='Country', ylabel='count'>
# Cross-tabulate Accident Level (rows) against Country (columns)
ct = pd.crosstab(data['Accident Level'], data['Country'])
print("\n Cross table Analysis of features: ",'Accident Level',' and ', 'Country')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Accident Level and Country ------------------------------------------------------
| Country | Country_01 | Country_02 | Country_03 |
|---|---|---|---|
| Accident Level | |||
| I | 177 | 98 | 34 |
| II | 19 | 19 | 2 |
| III | 21 | 7 | 3 |
| IV | 23 | 5 | 2 |
| V | 8 | 0 | 0 |
Accident Level I is more dominant across all Countries, while Accident Level V is least dominant across all countries
# Count plot: accidents per Potential Accident Level, split by nature of employee
sns.countplot(data=data, x="Potential Accident Level", hue="Natureofemployee")
<AxesSubplot:xlabel='Potential Accident Level', ylabel='count'>
# Cross-tabulate Potential Accident Level against Natureofemployee
ct = pd.crosstab(data['Potential Accident Level'], data['Natureofemployee'])
print("\n Cross table Analysis of features: ",'Potential Accident Level',' and ', 'Natureofemployee')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Potential Accident Level and Natureofemployee ------------------------------------------------------
| Natureofemployee | Employee | Third Party | Third Party (Remote) |
|---|---|---|---|
| Potential Accident Level | |||
| I | 12 | 29 | 4 |
| II | 44 | 37 | 14 |
| III | 53 | 35 | 18 |
| IV | 58 | 68 | 15 |
| V | 11 | 15 | 4 |
| VI | 0 | 1 | 0 |
Potential Accident Level IV is dominant for the Third Party employee type, while Level VI is the rarest overall (a single Third Party case; none for Employee or Third Party (Remote)).
# Count plot: accidents per Potential Accident Level, split by Critical Risk (legend outside)
sns.countplot(data=data, x="Potential Accident Level", hue="Critical Risk")
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
<matplotlib.legend.Legend at 0x18a29939ac0>
# Cross-tabulate Critical Risk (rows) against Potential Accident Level (columns)
ct = pd.crosstab(data['Critical Risk'], data['Potential Accident Level'])
print("\n Cross table Analysis of features: ",'Potential Accident Level',' and ', 'Critical Risk')
print("------------------------------------------------------")
display(ct)
Cross table Analysis of features: Potential Accident Level and Critical Risk ------------------------------------------------------
| Potential Accident Level | I | II | III | IV | V | VI |
|---|---|---|---|---|---|---|
| Critical Risk | ||||||
| \nNot applicable | 0 | 0 | 0 | 0 | 1 | 0 |
| Bees | 10 | 0 | 0 | 0 | 0 | 0 |
| Blocking and isolation of energies | 0 | 1 | 2 | 0 | 0 | 0 |
| Burn | 0 | 0 | 0 | 1 | 0 | 0 |
| Chemical substances | 0 | 5 | 8 | 4 | 0 | 0 |
| Confined space | 0 | 0 | 1 | 0 | 0 | 0 |
| Cut | 1 | 6 | 5 | 2 | 0 | 0 |
| Electrical Shock | 0 | 0 | 0 | 2 | 0 | 0 |
| Electrical installation | 0 | 0 | 0 | 0 | 1 | 0 |
| Fall | 1 | 1 | 4 | 2 | 1 | 0 |
| Fall prevention | 1 | 0 | 0 | 5 | 0 | 0 |
| Fall prevention (same level) | 1 | 1 | 3 | 2 | 0 | 0 |
| Individual protection equipment | 0 | 0 | 0 | 1 | 0 | 0 |
| Liquid Metal | 1 | 0 | 0 | 2 | 0 | 0 |
| Machine Protection | 0 | 0 | 2 | 0 | 0 | 0 |
| Manual Tools | 2 | 5 | 9 | 4 | 0 | 0 |
| Others | 16 | 60 | 53 | 85 | 15 | 0 |
| Plates | 0 | 1 | 0 | 0 | 0 | 0 |
| Poll | 0 | 0 | 0 | 1 | 0 | 0 |
| Power lock | 0 | 0 | 0 | 0 | 3 | 0 |
| Pressed | 2 | 5 | 9 | 7 | 1 | 0 |
| Pressurized Systems | 0 | 2 | 3 | 2 | 0 | 0 |
| Pressurized Systems / Chemical Substances | 0 | 1 | 0 | 2 | 0 | 0 |
| Projection | 0 | 2 | 2 | 7 | 2 | 0 |
| Projection of fragments | 0 | 0 | 0 | 1 | 0 | 0 |
| Projection/Burning | 0 | 0 | 0 | 1 | 0 | 0 |
| Projection/Choco | 0 | 1 | 0 | 0 | 0 | 0 |
| Projection/Manual Tools | 0 | 0 | 1 | 0 | 0 | 0 |
| Suspended Loads | 0 | 1 | 0 | 5 | 0 | 0 |
| Traffic | 0 | 1 | 0 | 0 | 0 | 0 |
| Vehicles and Mobile Equipment | 0 | 0 | 2 | 2 | 4 | 0 |
| Venomous Animals | 10 | 2 | 1 | 0 | 0 | 0 |
| remains of choco | 0 | 0 | 1 | 3 | 2 | 1 |
Among all Critical Risk categories, "Others" is dominant across all Potential Accident Levels.
# Mean Month per Accident Level, split by Year.
plt.figure(figsize=(10, 6))
# Fix: pass x/y/hue as keyword arguments. seaborn deprecated positional x/y
# in 0.12 and removed them later, so the old positional call breaks there.
sns.barplot(x=data['Accident Level'], y=data['Month'], hue=data['Year'], palette='muted')
<AxesSubplot:xlabel='Accident Level', ylabel='Month'>
sns.countplot(x="Potential Accident Level",hue="Year", data=data)
<AxesSubplot:xlabel='Potential Accident Level', ylabel='count'>
# Cross-tabulate Potential Accident Level against Year.
idx_feat, col_feat = 'Potential Accident Level', 'Year'
bivariate_analysis_df = pd.crosstab(index=data[idx_feat], columns=data[col_feat])
print("\n Cross table Analysis of features: ", idx_feat, ' and ', col_feat)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Potential Accident Level and Year ------------------------------------------------------
| Year | 2016 | 2017 |
|---|---|---|
| Potential Accident Level | ||
| I | 26 | 19 |
| II | 69 | 26 |
| III | 75 | 31 |
| IV | 97 | 44 |
| V | 16 | 14 |
| VI | 0 | 1 |
The number of accidents decreased at every Potential Accident Level from 2016 to 2017 (except Level VI, which rose from 0 to 1). Potential Accident Level IV is dominant in both years.
sns.countplot(x="Potential Accident Level",hue="Country", data=data)
<AxesSubplot:xlabel='Potential Accident Level', ylabel='count'>
# Cross-tabulate Potential Accident Level against Country.
idx_feat, col_feat = 'Potential Accident Level', 'Country'
bivariate_analysis_df = pd.crosstab(index=data[idx_feat], columns=data[col_feat])
print("\n Cross table Analysis of features: ", idx_feat, ' and ', col_feat)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Potential Accident Level and Country ------------------------------------------------------
| Country | Country_01 | Country_02 | Country_03 |
|---|---|---|---|
| Potential Accident Level | |||
| I | 10 | 6 | 29 |
| II | 51 | 40 | 4 |
| III | 64 | 41 | 1 |
| IV | 101 | 33 | 7 |
| V | 21 | 9 | 0 |
| VI | 1 | 0 | 0 |
Potential Accident Level IV is dominant across all countries, while Level VI accounts for the fewest accidents.
# Counts by nature of employee, split by Critical Risk; legend outside the axes.
ax = sns.countplot(data=data, x="Natureofemployee", hue="Critical Risk")
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
<matplotlib.legend.Legend at 0x18a29eebe20>
# Cross-tabulate Critical Risk against the nature of the employee.
idx_feat, col_feat = 'Critical Risk', 'Natureofemployee'
bivariate_analysis_df = pd.crosstab(index=data[idx_feat], columns=data[col_feat])
print("\n Cross table Analysis of features: ", idx_feat, ' and ', col_feat)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Critical Risk and Natureofemployee ------------------------------------------------------
| Natureofemployee | Employee | Third Party | Third Party (Remote) |
|---|---|---|---|
| Critical Risk | |||
| \nNot applicable | 1 | 0 | 0 |
| Bees | 1 | 9 | 0 |
| Blocking and isolation of energies | 2 | 0 | 1 |
| Burn | 1 | 0 | 0 |
| Chemical substances | 9 | 2 | 6 |
| Confined space | 1 | 0 | 0 |
| Cut | 8 | 5 | 1 |
| Electrical Shock | 0 | 0 | 2 |
| Electrical installation | 0 | 1 | 0 |
| Fall | 0 | 5 | 4 |
| Fall prevention | 2 | 3 | 1 |
| Fall prevention (same level) | 3 | 4 | 0 |
| Individual protection equipment | 0 | 1 | 0 |
| Liquid Metal | 3 | 0 | 0 |
| Machine Protection | 1 | 1 | 0 |
| Manual Tools | 7 | 7 | 6 |
| Others | 99 | 109 | 21 |
| Plates | 1 | 0 | 0 |
| Poll | 0 | 1 | 0 |
| Power lock | 0 | 0 | 3 |
| Pressed | 12 | 7 | 5 |
| Pressurized Systems | 4 | 1 | 2 |
| Pressurized Systems / Chemical Substances | 1 | 1 | 1 |
| Projection | 7 | 6 | 0 |
| Projection of fragments | 0 | 1 | 0 |
| Projection/Burning | 1 | 0 | 0 |
| Projection/Choco | 0 | 1 | 0 |
| Projection/Manual Tools | 0 | 1 | 0 |
| Suspended Loads | 4 | 1 | 1 |
| Traffic | 1 | 0 | 0 |
| Vehicles and Mobile Equipment | 3 | 5 | 0 |
| Venomous Animals | 3 | 9 | 1 |
| remains of choco | 3 | 4 | 0 |
Critical Risk of type "Others" is dominant across all types of employees.
# Monthly accident counts, one hue per year; legend outside the axes.
ax = sns.countplot(data=data, x="Month", hue="Year")
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
<matplotlib.legend.Legend at 0x18a2b0c1d30>
# Cross-tabulate Month against Year.
idx_feat, col_feat = 'Month', 'Year'
bivariate_analysis_df = pd.crosstab(index=data[idx_feat], columns=data[col_feat])
print("\n Cross table Analysis of features: ", idx_feat, ' and ', col_feat)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Month and Year ------------------------------------------------------
| Year | 2016 | 2017 |
|---|---|---|
| Month | ||
| 1 | 12 | 27 |
| 2 | 31 | 30 |
| 3 | 34 | 16 |
| 4 | 28 | 23 |
| 5 | 26 | 14 |
| 6 | 31 | 20 |
| 7 | 19 | 5 |
| 8 | 21 | 0 |
| 9 | 24 | 0 |
| 10 | 21 | 0 |
| 11 | 13 | 0 |
| 12 | 23 | 0 |
From the above plot, most accidents occurred in 2016, with March 2016 the single busiest month (note: the 2017 data ends in July).
sns.countplot(x="Local",hue="Year", data=data)
<AxesSubplot:xlabel='Local', ylabel='count'>
# Cross-tabulate Year (rows) against Local (columns).
idx_feat, col_feat = 'Year', 'Local'
bivariate_analysis_df = pd.crosstab(index=data[idx_feat], columns=data[col_feat])
print("\n Cross table Analysis of features: ", col_feat, ' and ', idx_feat)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Local and Year ------------------------------------------------------
| Local | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | ||||||||||||
| 2016 | 38 | 13 | 63 | 38 | 45 | 33 | 7 | 18 | 1 | 23 | 2 | 2 |
| 2017 | 18 | 10 | 26 | 17 | 14 | 13 | 7 | 9 | 1 | 18 | 0 | 2 |
From the above plot, the maximum number of accidents took place in Local 3 during 2016.
sns.countplot(x="Weekday",hue="Year", data=data)
<AxesSubplot:xlabel='Weekday', ylabel='count'>
# Cross-tabulate Year (rows) against Weekday (columns).
idx_feat, col_feat = 'Year', 'Weekday'
bivariate_analysis_df = pd.crosstab(index=data[idx_feat], columns=data[col_feat])
print("\n Cross table Analysis of features: ", col_feat, ' and ', idx_feat)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Weekday and Year ------------------------------------------------------
| Weekday | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| Year | |||||||
| 2016 | 44 | 40 | 36 | 25 | 58 | 40 | 40 |
| 2017 | 17 | 13 | 20 | 16 | 18 | 29 | 22 |
From the above plot, the maximum number of accidents took place on Thursdays in 2016.
# Yearly counts split by Critical Risk; legend placed outside the axes.
ax = sns.countplot(data=data, x="Year", hue="Critical Risk")
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
<matplotlib.legend.Legend at 0x18a2b3d26a0>
# Cross-tabulate Critical Risk against Year.
idx_feat, col_feat = 'Critical Risk', 'Year'
bivariate_analysis_df = pd.crosstab(index=data[idx_feat], columns=data[col_feat])
print("\n Cross table Analysis of features: ", idx_feat, ' and ', col_feat)
print("------------------------------------------------------")
display(bivariate_analysis_df)
Cross table Analysis of features: Critical Risk and Year ------------------------------------------------------
| Year | 2016 | 2017 |
|---|---|---|
| Critical Risk | ||
| \nNot applicable | 1 | 0 |
| Bees | 10 | 0 |
| Blocking and isolation of energies | 3 | 0 |
| Burn | 0 | 1 |
| Chemical substances | 13 | 4 |
| Confined space | 1 | 0 |
| Cut | 6 | 8 |
| Electrical Shock | 0 | 2 |
| Electrical installation | 1 | 0 |
| Fall | 2 | 7 |
| Fall prevention | 1 | 5 |
| Fall prevention (same level) | 6 | 1 |
| Individual protection equipment | 0 | 1 |
| Liquid Metal | 2 | 1 |
| Machine Protection | 0 | 2 |
| Manual Tools | 14 | 6 |
| Others | 189 | 40 |
| Plates | 1 | 0 |
| Poll | 1 | 0 |
| Power lock | 0 | 3 |
| Pressed | 14 | 10 |
| Pressurized Systems | 7 | 0 |
| Pressurized Systems / Chemical Substances | 3 | 0 |
| Projection | 1 | 12 |
| Projection of fragments | 0 | 1 |
| Projection/Burning | 0 | 1 |
| Projection/Choco | 0 | 1 |
| Projection/Manual Tools | 0 | 1 |
| Suspended Loads | 5 | 1 |
| Traffic | 1 | 0 |
| Vehicles and Mobile Equipment | 0 | 8 |
| Venomous Animals | 1 | 12 |
| remains of choco | 0 | 7 |
From the above table, the "Others" Critical Risk category accounts for the most accidents, peaking in 2016.
# Pair plot over the numeric columns (Local, Year, Month per the corr matrix below).
sns.pairplot(data=data)
<seaborn.axisgrid.PairGrid at 0x18a2b59abb0>
data.corr()
| Local | Year | Month | |
|---|---|---|---|
| Local | 1.000000 | 0.054246 | 0.019061 |
| Year | 0.054246 | 1.000000 | -0.416621 |
| Month | 0.019061 | -0.416621 | 1.000000 |
From the correlation matrix, "Year" and "Month" show a moderate negative correlation (-0.42), while "Local" is essentially uncorrelated with both.
data.columns
Index(['Date', 'Country', 'Local', 'Industry Sector', 'Accident Level',
'Potential Accident Level', 'Gender', 'Natureofemployee',
'Critical Risk', 'Description', 'Year', 'Month', 'Weekday', 'Season'],
dtype='object')
data.groupby(['Year','Accident Level','Potential Accident Level'])[['Accident Level']].count()
| Accident Level | |||
|---|---|---|---|
| Year | Accident Level | Potential Accident Level | |
| 2016 | I | I | 26 |
| II | 62 | ||
| III | 64 | ||
| IV | 53 | ||
| V | 6 | ||
| II | II | 7 | |
| III | 9 | ||
| IV | 10 | ||
| III | III | 2 | |
| IV | 20 | ||
| V | 2 | ||
| IV | IV | 14 | |
| V | 5 | ||
| V | V | 3 | |
| 2017 | I | I | 19 |
| II | 26 | ||
| III | 25 | ||
| IV | 25 | ||
| V | 3 | ||
| II | III | 5 | |
| IV | 6 | ||
| V | 3 | ||
| III | III | 1 | |
| IV | 6 | ||
| IV | IV | 7 | |
| V | 4 | ||
| V | V | 4 | |
| VI | 1 |
Year 2016 with Accident Level I has maximum accidents of 64 with Potential Accident Level III and 62 with Potential Accident Level II
Year 2017 with Accident Level I has maximum accidents of 26 with Potential Accident Level II and 25 with Potential Accident Level III,IV
data.groupby(['Year','Industry Sector','Accident Level'])[['Accident Level']].count()
| Accident Level | |||
|---|---|---|---|
| Year | Industry Sector | Accident Level | |
| 2016 | Metals | I | 79 |
| II | 9 | ||
| III | 4 | ||
| IV | 5 | ||
| Mining | I | 112 | |
| II | 15 | ||
| III | 17 | ||
| IV | 12 | ||
| V | 3 | ||
| Others | I | 20 | |
| II | 2 | ||
| III | 3 | ||
| IV | 2 | ||
| 2017 | Metals | I | 28 |
| II | 3 | ||
| III | 3 | ||
| IV | 2 | ||
| V | 1 | ||
| Mining | I | 51 | |
| II | 11 | ||
| III | 3 | ||
| IV | 9 | ||
| V | 4 | ||
| Others | I | 19 | |
| III | 1 |
Year 2016 with Industry Sector of Type "Others" has maximum accidents of 20 with Accident Level I
Year 2017 with Industry Sector of Type "Metals" has maximum accidents of 28 with Accident Level I
data.groupby(['Industry Sector','Country','Accident Level'])[['Accident Level']].count()
| Accident Level | |||
|---|---|---|---|
| Industry Sector | Country | Accident Level | |
| Metals | Country_01 | I | 36 |
| II | 1 | ||
| III | 3 | ||
| IV | 5 | ||
| V | 1 | ||
| Country_02 | I | 71 | |
| II | 11 | ||
| III | 4 | ||
| IV | 2 | ||
| Mining | Country_01 | I | 140 |
| II | 18 | ||
| III | 17 | ||
| IV | 18 | ||
| V | 7 | ||
| Country_02 | I | 23 | |
| II | 8 | ||
| III | 3 | ||
| IV | 3 | ||
| Others | Country_01 | I | 1 |
| III | 1 | ||
| Country_02 | I | 4 | |
| Country_03 | I | 34 | |
| II | 2 | ||
| III | 3 | ||
| IV | 2 |
#!pip install wordcloud
#!pip install pandas_profiling
from wordcloud import WordCloud

# Render one word cloud per Accident Level from the incident descriptions.
for i in data['Accident Level'].unique():
    print('WordCloud for Accident Level :', i, '\n')
    # NOTE(review): split()[1] keeps only the SECOND word of each description --
    # presumably unintentional; confirm whether the full text was meant here.
    # The length guard avoids an IndexError on empty/one-word descriptions.
    text = " ".join(cat.split()[1]
                    for cat in data[data['Accident Level'] == i]['Description']
                    if len(cat.split()) > 1)
    # Creating word_cloud with text as argument in .generate() method
    word_cloud = WordCloud(collocations=False, background_color='lightyellow').generate(text)
    # Display the generated Word Cloud
    plt.figure(figsize=[10, 10])
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()  # fix: plt.show was referenced but never called (no-op)
    print('-----------------------------')
WordCloud for Accident Level : I ----------------------------- WordCloud for Accident Level : IV ----------------------------- WordCloud for Accident Level : III ----------------------------- WordCloud for Accident Level : II ----------------------------- WordCloud for Accident Level : V -----------------------------
# Render one word cloud per Potential Accident Level (skipping VI, which has
# a single record -- too little text to build a cloud from).
for i in data['Potential Accident Level'].unique():
    if i != 'VI':
        print('WordCloud for Potential Accident Level :', str(i), '\n')
        # NOTE(review): split()[1] keeps only the SECOND word of each description;
        # confirm whether the full text was meant. Guard avoids IndexError on
        # empty/one-word descriptions.
        text = " ".join(cat.split()[1]
                        for cat in data[data['Potential Accident Level'] == i]['Description']
                        if len(cat.split()) > 1)
        # Creating word_cloud with text as argument in .generate() method
        word_cloud = WordCloud(collocations=False, background_color='lightyellow').generate(text)
        # Display the generated Word Cloud
        plt.figure(figsize=[10, 10])
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()  # fix: plt.show was referenced but never called (no-op)
        print('-----------------------------')
WordCloud for Potential Accident Level : IV ----------------------------- WordCloud for Potential Accident Level : III ----------------------------- WordCloud for Potential Accident Level : I ----------------------------- WordCloud for Potential Accident Level : II ----------------------------- WordCloud for Potential Accident Level : V -----------------------------
# Render one word cloud per Industry Sector from the incident descriptions.
for i in data['Industry Sector'].unique():
    print('WordCloud for Industry type :', str(i), '\n')
    # NOTE(review): split()[1] keeps only the SECOND word of each description;
    # confirm whether the full text was meant. Guard avoids IndexError on
    # empty/one-word descriptions.
    text = " ".join(cat.split()[1]
                    for cat in data[data['Industry Sector'] == i]['Description']
                    if len(cat.split()) > 1)
    # Creating word_cloud with text as argument in .generate() method
    word_cloud = WordCloud(collocations=False, background_color='lightyellow').generate(text)
    # Display the generated Word Cloud
    plt.figure(figsize=[10, 10])
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()  # fix: plt.show was referenced but never called (no-op)
    print('-----------------------------')
WordCloud for Industry type : Mining ----------------------------- WordCloud for Industry type : Metals ----------------------------- WordCloud for Industry type : Others -----------------------------
# Render one word cloud per Country from the incident descriptions.
for i in data['Country'].unique():
    print('WordCloud for Country :', i, '\n')
    # NOTE(review): split()[1] keeps only the SECOND word of each description;
    # confirm whether the full text was meant. Guard avoids IndexError on
    # empty/one-word descriptions.
    text = " ".join(cat.split()[1]
                    for cat in data[data['Country'] == i]['Description']
                    if len(cat.split()) > 1)
    # Creating word_cloud with text as argument in .generate() method
    word_cloud = WordCloud(collocations=False, background_color='lightyellow').generate(text)
    # Display the generated Word Cloud
    plt.figure(figsize=[10, 10])
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()  # fix: plt.show was referenced but never called (no-op)
    print('-----------------------------')
WordCloud for Country : Country_01 ----------------------------- WordCloud for Country : Country_02 ----------------------------- WordCloud for Country : Country_03 -----------------------------
# Generate an automated EDA report with pandas-profiling.
# NOTE(review): in the captured run the install failed while building a wheel
# (BadZipFile), so the import raised ModuleNotFoundError. Reinstall the package
# and restart the kernel before rerunning this cell.
!pip install pandas-profiling
from pandas_profiling import ProfileReport
profile = ProfileReport(data, title="Pandas Profiling Report")
profile.to_notebook_iframe()
Collecting pandas-profiling Using cached pandas_profiling-3.1.0-py2.py3-none-any.whl (261 kB) Collecting multimethod>=1.4 Using cached multimethod-1.8-py3-none-any.whl (9.8 kB) Requirement already satisfied: seaborn>=0.10.1 in d:\anaconda\lib\site-packages (from pandas-profiling) (0.11.1) Requirement already satisfied: numpy>=1.16.0 in d:\anaconda\lib\site-packages (from pandas-profiling) (1.20.1) Collecting phik>=0.11.1 Using cached phik-0.12.2-cp38-cp38-win_amd64.whl (677 kB) Requirement already satisfied: tqdm>=4.48.2 in d:\anaconda\lib\site-packages (from pandas-profiling) (4.59.0) Collecting visions[type_image_path]==0.7.4 Using cached visions-0.7.4-py3-none-any.whl (102 kB) Collecting missingno>=0.4.2 Using cached missingno-0.5.1-py3-none-any.whl (8.7 kB) Requirement already satisfied: jinja2>=2.11.1 in d:\anaconda\lib\site-packages (from pandas-profiling) (2.11.3) Requirement already satisfied: matplotlib>=3.2.0 in d:\anaconda\lib\site-packages (from pandas-profiling) (3.3.4) Collecting markupsafe~=2.0.1 Using cached MarkupSafe-2.0.1-cp38-cp38-win_amd64.whl (14 kB) Collecting htmlmin>=0.1.12 Using cached htmlmin-0.1.12.tar.gz (19 kB) Collecting tangled-up-in-unicode==0.1.0 Using cached tangled_up_in_unicode-0.1.0-py3-none-any.whl (3.1 MB) Requirement already satisfied: requests>=2.24.0 in d:\anaconda\lib\site-packages (from pandas-profiling) (2.25.1) Collecting pydantic>=1.8.1 Using cached pydantic-1.9.0-cp38-cp38-win_amd64.whl (2.1 MB) Requirement already satisfied: pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3 in d:\anaconda\lib\site-packages (from pandas-profiling) (1.2.4) Requirement already satisfied: scipy>=1.4.1 in d:\anaconda\lib\site-packages (from pandas-profiling) (1.6.2) Requirement already satisfied: joblib~=1.0.1 in d:\anaconda\lib\site-packages (from pandas-profiling) (1.0.1) Requirement already satisfied: PyYAML>=5.0.0 in d:\anaconda\lib\site-packages (from pandas-profiling) (5.4.1) Requirement already satisfied: networkx>=2.4 in 
d:\anaconda\lib\site-packages (from visions[type_image_path]==0.7.4->pandas-profiling) (2.5) Requirement already satisfied: attrs>=19.3.0 in d:\anaconda\lib\site-packages (from visions[type_image_path]==0.7.4->pandas-profiling) (20.3.0) Collecting imagehash Using cached ImageHash-4.2.1.tar.gz (812 kB) Requirement already satisfied: Pillow in d:\anaconda\lib\site-packages (from visions[type_image_path]==0.7.4->pandas-profiling) (8.2.0) Requirement already satisfied: python-dateutil>=2.1 in d:\anaconda\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling) (2.8.1) Requirement already satisfied: kiwisolver>=1.0.1 in d:\anaconda\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling) (1.3.1) Requirement already satisfied: cycler>=0.10 in d:\anaconda\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling) (0.10.0) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in d:\anaconda\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling) (2.4.7) Requirement already satisfied: six in d:\anaconda\lib\site-packages (from cycler>=0.10->matplotlib>=3.2.0->pandas-profiling) (1.15.0) Requirement already satisfied: decorator>=4.3.0 in d:\anaconda\lib\site-packages (from networkx>=2.4->visions[type_image_path]==0.7.4->pandas-profiling) (5.0.6) Requirement already satisfied: pytz>=2017.3 in d:\anaconda\lib\site-packages (from pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3->pandas-profiling) (2021.1) Requirement already satisfied: typing-extensions>=3.7.4.3 in d:\anaconda\lib\site-packages (from pydantic>=1.8.1->pandas-profiling) (3.7.4.3) Requirement already satisfied: urllib3<1.27,>=1.21.1 in d:\anaconda\lib\site-packages (from requests>=2.24.0->pandas-profiling) (1.26.4) Requirement already satisfied: chardet<5,>=3.0.2 in d:\anaconda\lib\site-packages (from requests>=2.24.0->pandas-profiling) (4.0.0) Requirement already satisfied: idna<3,>=2.5 in d:\anaconda\lib\site-packages (from requests>=2.24.0->pandas-profiling) (2.10) Requirement 
already satisfied: certifi>=2017.4.17 in d:\anaconda\lib\site-packages (from requests>=2.24.0->pandas-profiling) (2020.12.5) Requirement already satisfied: PyWavelets in d:\anaconda\lib\site-packages (from imagehash->visions[type_image_path]==0.7.4->pandas-profiling) (1.1.1) Building wheels for collected packages: htmlmin, imagehash Building wheel for htmlmin (setup.py): started Building wheel for htmlmin (setup.py): finished with status 'done' Created wheel for htmlmin: filename=htmlmin-0.1.12-py3-none-any.whl size=27085 sha256=feb91c119b972bd3018b186be265b63a8355684e396f35535e235952ac033318 Stored in directory: c:\users\manka\appdata\local\pip\cache\wheels\23\14\6e\4be5bfeeb027f4939a01764b48edd5996acf574b0913fe5243 Building wheel for imagehash (setup.py): started Building wheel for imagehash (setup.py): finished with status 'done' Created wheel for imagehash: filename=ImageHash-4.2.1-py2.py3-none-any.whl size=295198 sha256=9806374aaeb88ac557d40843f7750f4b4d6432b85e1f9a71b60f9d50240f670b Stored in directory: c:\users\manka\appdata\local\pip\cache\wheels\48\a1\7f\096c1269d6bf78d4768180602579b35a1e8cb1250bb4b40c74
ERROR: Exception:
Traceback (most recent call last):
File "D:\anaconda\lib\site-packages\pip\_internal\cli\base_command.py", line 189, in _main
status = self.run(options, args)
File "D:\anaconda\lib\site-packages\pip\_internal\cli\req_command.py", line 178, in wrapper
return func(self, options, args)
File "D:\anaconda\lib\site-packages\pip\_internal\commands\install.py", line 343, in run
_, build_failures = build(
File "D:\anaconda\lib\site-packages\pip\_internal\wheel_builder.py", line 341, in build
wheel_file = _build_one(
File "D:\anaconda\lib\site-packages\pip\_internal\wheel_builder.py", line 241, in _build_one
_verify_one(req, wheel_path)
File "D:\anaconda\lib\site-packages\pip\_internal\wheel_builder.py", line 196, in _verify_one
with zipfile.ZipFile(wheel_path, allowZip64=True) as zf:
File "D:\anaconda\lib\zipfile.py", line 1269, in __init__
self._RealGetContents()
File "D:\anaconda\lib\zipfile.py", line 1336, in _RealGetContents
raise BadZipFile("File is not a zip file")
zipfile.BadZipFile: File is not a zip file
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_24528/1104041189.py in <module> 1 get_ipython().system('pip install pandas-profiling') ----> 2 from pandas_profiling import ProfileReport 3 profile = ProfileReport(data, title="Pandas Profiling Report") 4 profile.to_notebook_iframe() ModuleNotFoundError: No module named 'pandas_profiling'
Collecting pandas-profiling[notebook]
Downloading pandas_profiling-3.1.0-py2.py3-none-any.whl (261 kB)
Collecting htmlmin>=0.1.12
Downloading htmlmin-0.1.12.tar.gz (19 kB)
Collecting phik>=0.11.1
Downloading phik-0.12.2-cp38-cp38-win_amd64.whl (677 kB)
Requirement already satisfied: scipy>=1.4.1 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (1.6.2)
Requirement already satisfied: matplotlib>=3.2.0 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (3.3.4)
Collecting tangled-up-in-unicode==0.1.0
Downloading tangled_up_in_unicode-0.1.0-py3-none-any.whl (3.1 MB)
Requirement already satisfied: jinja2>=2.11.1 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (2.11.3)
Requirement already satisfied: PyYAML>=5.0.0 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (5.4.1)
Collecting pydantic>=1.8.1
Downloading pydantic-1.9.0-cp38-cp38-win_amd64.whl (2.1 MB)
Collecting markupsafe~=2.0.1
Downloading MarkupSafe-2.0.1-cp38-cp38-win_amd64.whl (14 kB)
Collecting visions[type_image_path]==0.7.4
Downloading visions-0.7.4-py3-none-any.whl (102 kB)
Collecting missingno>=0.4.2
Downloading missingno-0.5.1-py3-none-any.whl (8.7 kB)
Requirement already satisfied: tqdm>=4.48.2 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (4.59.0)
Collecting multimethod>=1.4
Downloading multimethod-1.8-py3-none-any.whl (9.8 kB)
Requirement already satisfied: pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (1.2.4)
Requirement already satisfied: joblib~=1.0.1 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (1.0.1)
Requirement already satisfied: requests>=2.24.0 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (2.25.1)
Requirement already satisfied: numpy>=1.16.0 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (1.20.1)
Requirement already satisfied: seaborn>=0.10.1 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (0.11.1)
Requirement already satisfied: ipywidgets>=7.5.1 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (7.6.3)
Requirement already satisfied: jupyter-core>=4.6.3 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (4.7.1)
Requirement already satisfied: jupyter-client>=6.0.0 in d:\anaconda\lib\site-packages (from pandas-profiling[notebook]) (6.1.12)
Requirement already satisfied: attrs>=19.3.0 in d:\anaconda\lib\site-packages (from visions[type_image_path]==0.7.4->pandas-profiling[notebook]) (20.3.0)
Requirement already satisfied: networkx>=2.4 in d:\anaconda\lib\site-packages (from visions[type_image_path]==0.7.4->pandas-profiling[notebook]) (2.5)
Collecting imagehash
Downloading ImageHash-4.2.1.tar.gz (812 kB)
Requirement already satisfied: Pillow in d:\anaconda\lib\site-packages (from visions[type_image_path]==0.7.4->pandas-profiling[notebook]) (8.2.0)
Requirement already satisfied: traitlets>=4.3.1 in d:\anaconda\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (5.0.5)
Requirement already satisfied: ipython>=4.0.0 in d:\anaconda\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (7.25.0)
Requirement already satisfied: widgetsnbextension~=3.5.0 in d:\anaconda\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (3.5.1)
Requirement already satisfied: jupyterlab-widgets>=1.0.0 in d:\anaconda\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.0.0)
Requirement already satisfied: ipykernel>=4.5.1 in d:\anaconda\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (6.0.3)
Requirement already satisfied: nbformat>=4.2.0 in d:\anaconda\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling[notebook]) (5.1.3)
Requirement already satisfied: matplotlib-inline<0.2.0,>=0.1.0 in d:\anaconda\lib\site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.1.2)
Requirement already satisfied: tornado<7.0,>=4.2 in d:\anaconda\lib\site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling[notebook]) (6.1)
Requirement already satisfied: debugpy<2.0,>=1.0.0 in d:\anaconda\lib\site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.4.1)
Requirement already satisfied: pygments in d:\anaconda\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (2.8.1)
Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in d:\anaconda\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (3.0.17)
Requirement already satisfied: setuptools>=18.5 in d:\anaconda\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (52.0.0.post20210125)
Requirement already satisfied: jedi>=0.16 in d:\anaconda\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.17.2)
Requirement already satisfied: pickleshare in d:\anaconda\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.7.5)
Requirement already satisfied: colorama in d:\anaconda\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.4.4)
Requirement already satisfied: backcall in d:\anaconda\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.2.0)
Requirement already satisfied: decorator in d:\anaconda\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (5.0.6)
Requirement already satisfied: parso<0.8.0,>=0.7.0 in d:\anaconda\lib\site-packages (from jedi>=0.16->ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.7.0)
Requirement already satisfied: python-dateutil>=2.1 in d:\anaconda\lib\site-packages (from jupyter-client>=6.0.0->pandas-profiling[notebook]) (2.8.1)
Requirement already satisfied: pyzmq>=13 in d:\anaconda\lib\site-packages (from jupyter-client>=6.0.0->pandas-profiling[notebook]) (20.0.0)
Requirement already satisfied: pywin32>=1.0 in d:\anaconda\lib\site-packages (from jupyter-core>=4.6.3->pandas-profiling[notebook]) (227)
Requirement already satisfied: cycler>=0.10 in d:\anaconda\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling[notebook]) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in d:\anaconda\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling[notebook]) (1.3.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in d:\anaconda\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling[notebook]) (2.4.7)
Requirement already satisfied: six in d:\anaconda\lib\site-packages (from cycler>=0.10->matplotlib>=3.2.0->pandas-profiling[notebook]) (1.15.0)
Requirement already satisfied: ipython-genutils in d:\anaconda\lib\site-packages (from nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.2.0)
Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in d:\anaconda\lib\site-packages (from nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (3.2.0)
Requirement already satisfied: pyrsistent>=0.14.0 in d:\anaconda\lib\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.17.3)
Requirement already satisfied: pytz>=2017.3 in d:\anaconda\lib\site-packages (from pandas!=1.0.0,!=1.0.1,!=1.0.2,!=1.1.0,>=0.25.3->pandas-profiling[notebook]) (2021.1)
Requirement already satisfied: wcwidth in d:\anaconda\lib\site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.2.5)
Requirement already satisfied: typing-extensions>=3.7.4.3 in d:\anaconda\lib\site-packages (from pydantic>=1.8.1->pandas-profiling[notebook]) (3.7.4.3)
Requirement already satisfied: idna<3,>=2.5 in d:\anaconda\lib\site-packages (from requests>=2.24.0->pandas-profiling[notebook]) (2.10)
Requirement already satisfied: certifi>=2017.4.17 in d:\anaconda\lib\site-packages (from requests>=2.24.0->pandas-profiling[notebook]) (2020.12.5)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in d:\anaconda\lib\site-packages (from requests>=2.24.0->pandas-profiling[notebook]) (1.26.4)
Requirement already satisfied: chardet<5,>=3.0.2 in d:\anaconda\lib\site-packages (from requests>=2.24.0->pandas-profiling[notebook]) (4.0.0)
Requirement already satisfied: notebook>=4.4.1 in d:\anaconda\lib\site-packages (from widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (6.3.0)
Requirement already satisfied: Send2Trash>=1.5.0 in d:\anaconda\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.5.0)
Requirement already satisfied: argon2-cffi in d:\anaconda\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (20.1.0)
Requirement already satisfied: terminado>=0.8.3 in d:\anaconda\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.9.4)
Requirement already satisfied: nbconvert in d:\anaconda\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (6.0.7)
Requirement already satisfied: prometheus-client in d:\anaconda\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.10.1)
Requirement already satisfied: pywinpty>=0.5 in d:\anaconda\lib\site-packages (from terminado>=0.8.3->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.5.7)
Requirement already satisfied: cffi>=1.0.0 in d:\anaconda\lib\site-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.14.5)
Requirement already satisfied: pycparser in d:\anaconda\lib\site-packages (from cffi>=1.0.0->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (2.20)
Requirement already satisfied: PyWavelets in d:\anaconda\lib\site-packages (from imagehash->visions[type_image_path]==0.7.4->pandas-profiling[notebook]) (1.1.1)
Requirement already satisfied: jupyterlab-pygments in d:\anaconda\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.1.2)
Requirement already satisfied: mistune<2,>=0.8.1 in d:\anaconda\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.8.4)
Requirement already satisfied: entrypoints>=0.2.2 in d:\anaconda\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.3)
Requirement already satisfied: defusedxml in d:\anaconda\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.7.1)
Requirement already satisfied: bleach in d:\anaconda\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (3.3.0)
Requirement already satisfied: pandocfilters>=1.4.1 in d:\anaconda\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.4.3)
Requirement already satisfied: testpath in d:\anaconda\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.4.4)
Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in d:\anaconda\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.5.3)
Requirement already satisfied: nest-asyncio in d:\anaconda\lib\site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.5.1)
Requirement already satisfied: async-generator in d:\anaconda\lib\site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (1.10)
Requirement already satisfied: webencodings in d:\anaconda\lib\site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (0.5.1)
Requirement already satisfied: packaging in d:\anaconda\lib\site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling[notebook]) (20.9)
Building wheels for collected packages: htmlmin, imagehash
Building wheel for htmlmin (setup.py): started
Building wheel for htmlmin (setup.py): finished with status 'done'
Created wheel for htmlmin: filename=htmlmin-0.1.12-py3-none-any.whl size=27085 sha256=feb91c119b972bd3018b186be265b63a8355684e396f35535e235952ac033318
Stored in directory: c:\users\manka\appdata\local\pip\cache\wheels\23\14\6e\4be5bfeeb027f4939a01764b48edd5996acf574b0913fe5243
Building wheel for imagehash (setup.py): started
Building wheel for imagehash (setup.py): finished with status 'done'
Created wheel for imagehash: filename=ImageHash-4.2.1-py2.py3-none-any.whl size=295198 sha256=9806374aaeb88ac557d40843f7750f4b4d6432b85e1f9a71b60f9d50240f670b
Stored in directory: c:\users\manka\appdata\local\pip\cache\wheels\48\a1\7f\096c1269d6bf78d4768180602579b35a1e8cb1250bb4b40c74
Successfully built htmlmin imagehash
Installing collected packages: markupsafe, tangled-up-in-unicode, multimethod, visions, imagehash, pydantic, phik, missingno, htmlmin, pandas-profiling
Attempting uninstall: markupsafe
Found existing installation: MarkupSafe 1.1.1
Uninstalling MarkupSafe-1.1.1:
Successfully uninstalled MarkupSafe-1.1.1
ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'D:\\anaconda\\Lib\\site-packages\\~arkupsafe\\_speedups.cp38-win_amd64.pyd' Consider using the `--user` option or check the permissions.
# Persist the de-duplicated dataset for downstream use.
# index=False prevents pandas from writing the row index as an extra
# column (the original file's "Unnamed: 0" artifact we had to drop).
data.to_csv('data.csv', index=False)